diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index be3c4f0b1564c..5917c1497d80e 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -10,7 +10,7 @@ // replace them with smaller constant pool entries, including: // * Converting AVX512 memory-fold instructions to their broadcast-fold form // * Broadcasting of full width loads. -// * TODO: Sign/Zero extension of full width loads. +// * TODO: Zero extension of full width loads. // //===----------------------------------------------------------------------===// @@ -265,11 +265,47 @@ static Constant *rebuildZeroUpperCst(const Constant *C, unsigned /*NumElts*/, return nullptr; } +static Constant *rebuildExtCst(const Constant *C, bool IsSExt, unsigned NumElts, + unsigned SrcEltBitWidth) { + Type *Ty = C->getType(); + unsigned NumBits = Ty->getPrimitiveSizeInBits(); + unsigned DstEltBitWidth = NumBits / NumElts; + assert((NumBits % NumElts) == 0 && (NumBits % SrcEltBitWidth) == 0 && + (DstEltBitWidth % SrcEltBitWidth) == 0 && + (DstEltBitWidth > SrcEltBitWidth) && "Illegal extension width"); + + if (std::optional<APInt> Bits = extractConstantBits(C)) { + assert((Bits->getBitWidth() / DstEltBitWidth) == NumElts && + (Bits->getBitWidth() % DstEltBitWidth) == 0 && + "Unexpected constant extension"); + + // Ensure every vector element can be represented by the src bitwidth. 
+ APInt TruncBits = APInt::getZero(NumElts * SrcEltBitWidth); + for (unsigned I = 0; I != NumElts; ++I) { + APInt Elt = Bits->extractBits(DstEltBitWidth, I * DstEltBitWidth); + if ((IsSExt && Elt.getSignificantBits() > SrcEltBitWidth) || + (!IsSExt && Elt.getActiveBits() > SrcEltBitWidth)) + return nullptr; + TruncBits.insertBits(Elt.trunc(SrcEltBitWidth), I * SrcEltBitWidth); + } + + return rebuildConstant(Ty->getContext(), Ty->getScalarType(), TruncBits, + SrcEltBitWidth); + } + + return nullptr; +} +static Constant *rebuildSExtCst(const Constant *C, unsigned NumElts, + unsigned SrcEltBitWidth) { + return rebuildExtCst(C, true, NumElts, SrcEltBitWidth); +} + bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr &MI) { unsigned Opc = MI.getOpcode(); MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool(); + bool HasSSE41 = ST->hasSSE41(); bool HasAVX2 = ST->hasAVX2(); bool HasDQI = ST->hasDQI(); bool HasBWI = ST->hasBWI(); @@ -312,7 +348,15 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, return false; }; - // Attempt to convert full width vector loads into broadcast/vzload loads. + // Attempt to detect a suitable vzload/broadcast/vextload from increasing + // constant bitwidths. Prefer vzload/broadcast/vextload for same bitwidth: + // - vzload shouldn't ever need a shuffle port to zero the upper elements and + // the fp/int domain versions are equally available so we don't introduce a + // domain crossing penalty. + // - broadcast sometimes need a shuffle port (especially for 8/16-bit + // variants), AVX1 only has fp domain broadcasts but AVX2+ have good fp/int + // domain equivalents. + // - vextload always needs a shuffle port and is only ever int domain. 
switch (Opc) { /* FP Loads */ case X86::MOVAPDrm: @@ -370,22 +414,34 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, /* Integer Loads */ case X86::MOVDQArm: case X86::MOVDQUrm: { - return FixupConstant({{X86::MOVDI2PDIrm, 1, 32, rebuildZeroUpperCst}, - {X86::MOVQI2PQIrm, 1, 64, rebuildZeroUpperCst}}, - 1); + FixupEntry Fixups[] = { + {HasSSE41 ? X86::PMOVSXBQrm : 0, 2, 8, rebuildSExtCst}, + {X86::MOVDI2PDIrm, 1, 32, rebuildZeroUpperCst}, + {HasSSE41 ? X86::PMOVSXBDrm : 0, 4, 8, rebuildSExtCst}, + {HasSSE41 ? X86::PMOVSXWQrm : 0, 2, 16, rebuildSExtCst}, + {X86::MOVQI2PQIrm, 1, 64, rebuildZeroUpperCst}, + {HasSSE41 ? X86::PMOVSXBWrm : 0, 8, 8, rebuildSExtCst}, + {HasSSE41 ? X86::PMOVSXWDrm : 0, 4, 16, rebuildSExtCst}, + {HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst}}; + return FixupConstant(Fixups, 1); } case X86::VMOVDQArm: case X86::VMOVDQUrm: { FixupEntry Fixups[] = { {HasAVX2 ? X86::VPBROADCASTBrm : 0, 1, 8, rebuildSplatCst}, {HasAVX2 ? X86::VPBROADCASTWrm : 0, 1, 16, rebuildSplatCst}, + {X86::VPMOVSXBQrm, 2, 8, rebuildSExtCst}, {X86::VMOVDI2PDIrm, 1, 32, rebuildZeroUpperCst}, {HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBDrm, 4, 8, rebuildSExtCst}, + {X86::VPMOVSXWQrm, 2, 16, rebuildSExtCst}, {X86::VMOVQI2PQIrm, 1, 64, rebuildZeroUpperCst}, {HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}, - }; + {X86::VPMOVSXBWrm, 8, 8, rebuildSExtCst}, + {X86::VPMOVSXWDrm, 4, 16, rebuildSExtCst}, + {X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst}}; return FixupConstant(Fixups, 1); } case X86::VMOVDQAYrm: @@ -395,10 +451,16 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasAVX2 ? X86::VPBROADCASTWYrm : 0, 1, 16, rebuildSplatCst}, {HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst}, + {HasAVX2 ? X86::VPMOVSXBQYrm : 0, 4, 8, rebuildSExtCst}, {HasAVX2 ? 
X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst}, + {HasAVX2 ? X86::VPMOVSXBDYrm : 0, 8, 8, rebuildSExtCst}, + {HasAVX2 ? X86::VPMOVSXWQYrm : 0, 4, 16, rebuildSExtCst}, {HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm, 1, 128, - rebuildSplatCst}}; + rebuildSplatCst}, + {HasAVX2 ? X86::VPMOVSXBWYrm : 0, 16, 8, rebuildSExtCst}, + {HasAVX2 ? X86::VPMOVSXWDYrm : 0, 8, 16, rebuildSExtCst}, + {HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst}}; return FixupConstant(Fixups, 1); } case X86::VMOVDQA32Z128rm: @@ -408,10 +470,16 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, FixupEntry Fixups[] = { {HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1, 8, rebuildSplatCst}, {HasBWI ? X86::VPBROADCASTWZ128rm : 0, 1, 16, rebuildSplatCst}, + {X86::VPMOVSXBQZ128rm, 2, 8, rebuildSExtCst}, {X86::VMOVDI2PDIZrm, 1, 32, rebuildZeroUpperCst}, {X86::VPBROADCASTDZ128rm, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBDZ128rm, 4, 8, rebuildSExtCst}, + {X86::VPMOVSXWQZ128rm, 2, 16, rebuildSExtCst}, {X86::VMOVQI2PQIZrm, 1, 64, rebuildZeroUpperCst}, - {X86::VPBROADCASTQZ128rm, 1, 64, rebuildSplatCst}}; + {X86::VPBROADCASTQZ128rm, 1, 64, rebuildSplatCst}, + {HasBWI ? X86::VPMOVSXBWZ128rm : 0, 8, 8, rebuildSExtCst}, + {X86::VPMOVSXWDZ128rm, 4, 16, rebuildSExtCst}, + {X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst}}; return FixupConstant(Fixups, 1); } case X86::VMOVDQA32Z256rm: @@ -422,8 +490,14 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1, 8, rebuildSplatCst}, {HasBWI ? 
X86::VPBROADCASTWZ256rm : 0, 1, 16, rebuildSplatCst}, {X86::VPBROADCASTDZ256rm, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBQZ256rm, 4, 8, rebuildSExtCst}, {X86::VPBROADCASTQZ256rm, 1, 64, rebuildSplatCst}, - {X86::VBROADCASTI32X4Z256rm, 1, 128, rebuildSplatCst}}; + {X86::VPMOVSXBDZ256rm, 8, 8, rebuildSExtCst}, + {X86::VPMOVSXWQZ256rm, 4, 16, rebuildSExtCst}, + {X86::VBROADCASTI32X4Z256rm, 1, 128, rebuildSplatCst}, + {HasBWI ? X86::VPMOVSXBWZ256rm : 0, 16, 8, rebuildSExtCst}, + {X86::VPMOVSXWDZ256rm, 8, 16, rebuildSExtCst}, + {X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst}}; return FixupConstant(Fixups, 1); } case X86::VMOVDQA32Zrm: @@ -435,8 +509,14 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {HasBWI ? X86::VPBROADCASTWZrm : 0, 1, 16, rebuildSplatCst}, {X86::VPBROADCASTDZrm, 1, 32, rebuildSplatCst}, {X86::VPBROADCASTQZrm, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXBQZrm, 8, 8, rebuildSExtCst}, {X86::VBROADCASTI32X4rm, 1, 128, rebuildSplatCst}, - {X86::VBROADCASTI64X4rm, 1, 256, rebuildSplatCst}}; + {X86::VPMOVSXBDZrm, 16, 8, rebuildSExtCst}, + {X86::VPMOVSXWQZrm, 8, 16, rebuildSExtCst}, + {X86::VBROADCASTI64X4rm, 1, 256, rebuildSplatCst}, + {HasBWI ? 
X86::VPMOVSXBWZrm : 0, 32, 8, rebuildSExtCst}, + {X86::VPMOVSXWDZrm, 16, 16, rebuildSExtCst}, + {X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst}}; return FixupConstant(Fixups, 1); } } diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 58ebe023cd61e..55d7868c53053 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1582,6 +1582,36 @@ static void printBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer, } } +static bool printSignExtend(const MachineInstr *MI, MCStreamer &OutStreamer, + int SrcEltBits, int DstEltBits) { + auto *C = X86::getConstantFromPool(*MI, 1); + if (C && C->getType()->getScalarSizeInBits() == SrcEltBits) { + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + int NumElts = CDS->getNumElements(); + std::string Comment; + raw_string_ostream CS(Comment); + + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + CS << "["; + for (int i = 0; i != NumElts; ++i) { + if (i != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) { + APInt Elt = CDS->getElementAsAPInt(i).sext(DstEltBits); + printConstant(Elt, CS); + } else + CS << "?"; + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + return true; + } + } + + return false; +} + void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert((getSubtarget().isOSWindows() || TM.getTargetTriple().isUEFI()) && @@ -1844,7 +1874,7 @@ static void addConstantComments(const MachineInstr *MI, case X86::VMOVQI2PQIrm: case X86::VMOVQI2PQIZrm: printZeroUpperMove(MI, OutStreamer, 64, 128, "mem[0],zero"); - break; + break; case X86::MOVSSrm: case X86::VMOVSSrm: @@ -1979,6 +2009,36 @@ static void addConstantComments(const MachineInstr *MI, case X86::VPBROADCASTBZrm: printBroadcast(MI, OutStreamer, 64, 8); break; + +#define MOVX_CASE(Prefix, Ext, Type, Suffix) \ + case 
X86::Prefix##PMOV##Ext##Type##Suffix##rm: + +#define CASE_MOVX_RM(Ext, Type) \ + MOVX_CASE(, Ext, Type, ) \ + MOVX_CASE(V, Ext, Type, ) \ + MOVX_CASE(V, Ext, Type, Y) \ + MOVX_CASE(V, Ext, Type, Z128) \ + MOVX_CASE(V, Ext, Type, Z256) \ + MOVX_CASE(V, Ext, Type, Z) + + CASE_MOVX_RM(SX, BD) + printSignExtend(MI, OutStreamer, 8, 32); + break; + CASE_MOVX_RM(SX, BQ) + printSignExtend(MI, OutStreamer, 8, 64); + break; + CASE_MOVX_RM(SX, BW) + printSignExtend(MI, OutStreamer, 8, 16); + break; + CASE_MOVX_RM(SX, DQ) + printSignExtend(MI, OutStreamer, 32, 64); + break; + CASE_MOVX_RM(SX, WD) + printSignExtend(MI, OutStreamer, 16, 32); + break; + CASE_MOVX_RM(SX, WQ) + printSignExtend(MI, OutStreamer, 16, 64); + break; } } diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index e592b714a05dc..4242d8483e723 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -750,7 +750,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -761,7 +761,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax @@ -868,7 +868,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -881,7 +881,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -968,7 +968,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -978,7 +978,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 @@ -989,7 +989,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 @@ -1001,7 +1001,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1450,7 +1450,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -1466,8 +1466,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1640,7 +1639,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. 
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1759,7 +1758,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1875,7 +1874,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1984,7 +1983,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2010,7 +2009,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2032,7 +2031,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2136,7 +2135,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2162,7 +2161,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2184,7 +2183,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; 
AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2288,7 +2287,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] ; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2314,7 +2313,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] ; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2336,7 +2335,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -3226,7 +3225,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] 
; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 @@ -3245,8 +3244,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3354,7 +3352,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 @@ -3369,7 +3367,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -3544,7 +3542,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; 
AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -3677,7 +3675,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,25,26,0,28,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,25,26,0,28,29,0,31] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 @@ -3940,7 +3938,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,u,u,u,u,16,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -4180,7 +4178,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,u,u,u,u,16,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4279,7 +4277,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4296,7 +4294,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4310,7 +4308,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4392,7 +4390,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 @@ -4408,7 +4406,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4425,7 +4423,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4439,7 +4437,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -4452,7 +4450,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4545,7 +4543,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4562,7 +4560,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4575,7 +4573,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4646,7 +4644,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; 
AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -4660,7 +4658,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -4674,7 +4672,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -4685,7 +4683,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] +; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4695,7 +4693,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: 
vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] @@ -4793,7 +4791,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4810,7 +4808,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7] ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4836,8 +4834,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7] -; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: 
vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4923,7 +4920,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -4937,7 +4934,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7] ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -4948,7 +4945,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7] ; AVX512BW-SLOW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4958,8 +4955,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7] -; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] ; AVX512BW-FAST-NEXT: vpaddb 
(%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index d3f6bd20a0127..c40053dcb3e70 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -643,7 +643,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -652,7 +652,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] @@ -737,7 +737,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -747,7 +747,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -1190,7 +1190,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 @@ -1202,8 +1202,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm1 = 
[18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1339,7 +1338,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1428,7 +1427,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1515,7 +1514,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1678,7 +1677,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-LABEL: 
vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1697,7 +1696,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1716,7 +1715,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] ; AVX512BW-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1791,7 +1790,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ 
-1810,7 +1809,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1829,7 +1828,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2594,7 +2593,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0 @@ -2613,8 +2612,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: 
vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2706,7 +2704,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -2718,7 +2716,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -2730,7 +2728,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX512F-NEXT: vpternlogq $202, (%rdi), %xmm0, %xmm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2742,7 +2740,7 @@ define void 
@vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq $202, (%rdi), %xmm0, %xmm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2869,7 +2867,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2974,7 +2972,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,0,28,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,0,28,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3077,7 +3075,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3187,7 +3185,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,28,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,28,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3293,7 +3291,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3384,7 +3382,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3465,7 +3463,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,0,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3477,7 +3475,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3489,7 +3487,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,0,15] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3557,7 +3555,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3568,7 +3566,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,0] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3580,7 +3578,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3592,7 +3590,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512BW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,0] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3670,7 +3668,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3682,7 +3680,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = 
[0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3694,7 +3692,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3751,7 +3749,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -3760,7 +3758,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -3769,7 +3767,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3778,7 +3776,7 @@ define void 
@vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3855,7 +3853,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3867,7 +3865,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3879,7 +3877,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3946,7 +3944,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: 
; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -3955,7 +3953,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3964,7 +3962,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 3e7d1138132c4..6687346604adf 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1191,7 +1191,7 @@ define void @avg_v16i16_const(ptr %a) nounwind { ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) @@ -1241,7 +1241,7 @@ define void @avg_v32i16_const(ptr %a) nounwind { ; ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpavgw (%rdi), 
%xmm0, %xmm1 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll index abfe24ff9c50e..a11b92a663c45 100644 --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -651,7 +651,7 @@ entry: define <8 x i32> @ld0_hi0_lo1_8i32(ptr %pa, <8 x i32> %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: ld0_hi0_lo1_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,3,4] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -672,7 +672,7 @@ entry: define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, ptr %pb) nounwind uwtable readnone ssp { ; AVX1-LABEL: ld1_hi0_hi1_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,3,4] ; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index d32143cf33f2f..d8452d105ae55 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) { define <8 x i32> @mul_const9(<8 x i32> %x) { ; CHECK-LABEL: mul_const9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,0] ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %y = mul <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll index 3ab489ae05743..c5243a5c18a2d 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1062,56 +1062,56 @@ define <4 x i32> 
@test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> @test_x86_avx2_psllv_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; 
X86-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X86-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = 
[1,1,1,4294967295] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X64-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX512VL-NEXT: retq 
# encoding: [0xc3] @@ -1140,14 +1140,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1155,14 +1155,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1170,14 +1170,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psllv_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: 
{{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1185,14 +1185,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; 
X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1221,36 +1221,36 @@ define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) { define <2 x i64> @test_x86_avx2_psllv_q_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_q_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4294967295,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4294967295,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,18446744073709551615] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [4,18446744073709551615] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, 
value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1277,36 +1277,36 @@ define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) { define <4 x i64> @test_x86_avx2_psllv_q_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_q_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4294967295,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4294967295,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_q_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,18446744073709551615] -; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,18446744073709551615] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1333,14 +1333,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> 
@test_x86_avx2_psrlv_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1348,14 +1348,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: 
# fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1363,14 +1363,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = 
[4,4,4,4294967295] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1378,14 +1378,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression 
encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1415,14 +1415,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1430,14 +1430,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1445,14 +1445,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; ; X64-AVX-LABEL: 
test_x86_avx2_psrlv_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1460,14 +1460,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1497,8 +1497,8 @@ define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) { define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,0,4,0] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X86-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: 
[0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1506,8 +1506,8 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,0,4,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1515,8 +1515,8 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X64-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -1554,8 +1554,8 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) { define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X86-AVX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4] +; 
X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1563,8 +1563,8 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1610,36 +1610,36 @@ define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> @test_x86_avx2_psrav_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl 
# encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; 
X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1665,36 +1665,36 @@ define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psrav_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # 
EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll index 8fb7c65a9a60b..4bf2e2456482e 100644 --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -60,14 +60,14 @@ define <8 x i32> @test_vpslld_var(i32 %shift) { ; X86-LABEL: test_vpslld_var: ; X86: # %bb.0: ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] +; X86-NEXT: vpmovsxwd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] ; X86-NEXT: vpslld %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_vpslld_var: ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] +; X64-NEXT: vpmovsxwd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] ; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq %amt = insertelement <8 x i32> undef, i32 %shift, i32 0 @@ -276,8 +276,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; ; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64: ; X86-FAST-ALL: # %bb.0: -; X86-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; X86-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; X86-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] ; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -307,8 +306,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; ; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64: ; X64-FAST-ALL: # %bb.0: -; X64-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; X64-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; X64-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = 
[0,2,4,6,0,0,0,0] ; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 25e297993bd7c..0fae921b1ca83 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -293,7 +293,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512F-LABEL: imulq128_bcast: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] +; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -313,7 +313,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; ; AVX512BW-LABEL: imulq128_bcast: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] +; AVX512BW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -324,7 +324,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512DQ-LABEL: imulq128_bcast: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 6cbb5e0d3d6d3..b77c753107a6e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -7107,18 +7107,18 @@ define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) { define <16 x i32> 
@test_x86_avx512_psllv_d_512_const() { ; X64-LABEL: test_x86_avx512_psllv_d_512_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X64-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X64-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X64-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X64-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psllv_d_512_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X86-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X86-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7180,18 +7180,18 @@ define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) { define <8 x i64> @test_x86_avx512_psllv_q_512_const() { ; X64-LABEL: test_x86_avx512_psllv_q_512_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; X64-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] ; X64-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; X64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] ; X64-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: 
vpaddq %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psllv_q_512_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0] +; X86-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] ; X86-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295] +; X86-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] ; X86-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7355,18 +7355,18 @@ define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) { define <16 x i32> @test_x86_avx512_psrlv_d_512_const() { ; X64-LABEL: test_x86_avx512_psrlv_d_512_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X64-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X64-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psrlv_d_512_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X86-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X86-NEXT: vpsrlvd 
{{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7428,18 +7428,18 @@ define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) { define <8 x i64> @test_x86_avx512_psrlv_q_512_const() { ; X64-LABEL: test_x86_avx512_psrlv_q_512_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; X64-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] ; X64-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; X64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] ; X64-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psrlv_q_512_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0] +; X86-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] ; X86-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295] +; X86-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] ; X86-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index b648b086a8b68..17ac3e2e6364d 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -7,7 +7,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8] +; CHECK-NEXT: vpmovsxbw {{.*#+}} 
xmm1 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -18,7 +18,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -33,7 +33,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -47,7 +47,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -62,7 +62,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: 
test_masked_z_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -76,7 +76,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -91,7 +91,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -105,7 +105,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -116,7 +116,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) { define <8 
x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -131,7 +131,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -146,7 +146,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -157,7 +157,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} @@ -173,7 +173,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 ; CHECK-LABEL: 
test_masked_z_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -189,7 +189,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} @@ -205,7 +205,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -221,7 +221,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1] ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} @@ -237,7 +237,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} 
xmm1 = [9,0,3,0,5,0,7,1] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -253,7 +253,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -264,7 +264,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} @@ -280,7 +280,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -296,7 +296,7 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; 
CHECK-NEXT: retq @@ -307,7 +307,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} @@ -322,7 +322,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <1 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 @@ -336,7 +336,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} @@ -351,7 +351,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <1 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] ; CHECK-NEXT: vptestnmw 
%ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 @@ -365,7 +365,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} @@ -380,7 +380,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <1 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 @@ -394,7 +394,7 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -405,7 +405,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = 
[1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} @@ -420,7 +420,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -433,7 +433,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -445,7 +445,7 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 @@ -461,7 +461,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z} @@ -476,7 +476,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 @@ -492,7 +492,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z} @@ -507,7 +507,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 @@ -523,7 +523,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i define <8 x 
i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z} @@ -538,7 +538,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -550,7 +550,7 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 @@ -566,7 +566,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z} @@ 
-582,7 +582,7 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -593,7 +593,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} @@ -609,7 +609,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -625,7 +625,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: 
vmovdqu16 %ymm3, %ymm0 {%k1} @@ -641,7 +641,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -657,7 +657,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} @@ -673,7 +673,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -689,7 +689,7 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; 
CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -700,7 +700,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} @@ -716,7 +716,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -731,7 +731,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 ; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -744,7 +744,7 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] ; CHECK-NEXT: vmovdqa 
32(%rdi), %ymm3 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 @@ -761,7 +761,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -778,7 +778,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 @@ -795,7 +795,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -812,7 +812,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 
{{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 @@ -829,7 +829,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -846,7 +846,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -859,7 +859,7 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 @@ -876,7 +876,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: 
test_masked_z_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -893,7 +893,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16 define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -916,7 +916,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -931,7 +931,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,3,2] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -945,7 +945,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x 
i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -960,7 +960,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,7,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1012,7 +1012,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -1027,7 +1027,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,3,2,5] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1082,7 +1082,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> % ; CHECK-LABEL: 
test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,0,0,3] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1098,7 +1098,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1114,7 +1114,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> % ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0] ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1130,7 +1130,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1146,7 +1146,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,1,2,7] ; CHECK-NEXT: vpermi2d 
16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp @@ -1157,7 +1157,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> % ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,2,7] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1173,7 +1173,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,1,2,7] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1198,7 +1198,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1212,7 +1212,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} 
{z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1225,7 +1225,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1239,7 +1239,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1252,7 +1252,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1266,7 +1266,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = 
[2,15,15,2,6,10,14,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1289,7 +1289,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1303,7 +1303,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1327,7 +1327,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -1342,7 +1342,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 
x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1356,7 +1356,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 @@ -1372,7 +1372,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1387,7 +1387,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -1402,7 +1402,7 @@ define <4 x i32> 
@test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,1,13,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1427,7 +1427,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} @@ -1442,7 +1442,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,0,13] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1466,7 +1466,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1} 
; CHECK-NEXT: retq @@ -1480,7 +1480,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1495,7 +1495,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} @@ -1511,7 +1511,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1527,7 +1527,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} @@ -1543,7 +1543,7 @@ define <8 x i32> 
@test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1559,7 +1559,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1570,7 +1570,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} @@ -1586,7 +1586,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1601,7 +1601,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: 
test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6] ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1614,7 +1614,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 @@ -1631,7 +1631,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -1649,8 +1649,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,5,3,2,15,5,3,2] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0] ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1667,8 +1666,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; 
CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,5,3,2,15,5,3,2] -; CHECK-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1684,7 +1682,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 @@ -1701,7 +1699,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -1719,7 +1717,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6] ; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1730,7 +1728,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = 
[2,4,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6] ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1746,7 +1744,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1769,11 +1767,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,u,2] +; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2] ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3] +; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3] ; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 ; CHECK-FAST-PERLANE-NEXT: vzeroupper ; CHECK-FAST-PERLANE-NEXT: retq @@ -1951,7 +1949,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -1974,7 +1972,7 @@ define <4 x i64> 
@test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1995,8 +1993,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,3,6,3] -; CHECK-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2019,8 +2016,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3] -; CHECK-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2058,7 +2054,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # 
%bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,0,0,7] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2081,7 +2077,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,0,0,7] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2102,7 +2098,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2125,7 +2121,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2146,7 +2142,7 @@ define <4 x i64> 
@test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,1,0,6] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2160,7 +2156,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,1,0,6] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2190,7 +2186,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2213,7 +2209,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq 
%zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2236,7 +2232,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,0,3,4] ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} @@ -2250,7 +2246,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,0,3,4] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2365,7 +2361,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4] ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2389,7 +2385,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 
@@ -2413,7 +2409,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1] ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2437,7 +2433,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2461,7 +2457,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2] ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; @@ -2479,7 +2475,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,0,2] ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2503,7 +2499,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> ; CHECK-FAST-LABEL: 
test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2527,7 +2523,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> % ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,6,1] ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2543,7 +2539,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -2559,7 +2555,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,7,1] ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2583,7 +2579,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; 
CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2607,7 +2603,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2] ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp @@ -2618,7 +2614,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> % ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,2,3,2] ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2634,7 +2630,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -2650,7 +2646,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5] ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: 
vmovdqa64 %ymm3, %ymm0 {%k1} @@ -2674,7 +2670,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2715,7 +2711,7 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,1] ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm2 ; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} @@ -2739,7 +2735,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> % define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,1] ; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll index 884ff6ccf675e..2103ab87a17ad 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll @@ -4,7 +4,7 @@ define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_perm_mask0: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -13,7 +13,7 @@ define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -27,7 +27,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -39,7 +39,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -53,7 +53,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve 
define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -65,7 +65,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -79,7 +79,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -91,7 +91,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i16> %vec, <16 x i16> undef, 
<16 x i32> @@ -100,7 +100,7 @@ define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -114,7 +114,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -126,7 +126,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_16xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -136,7 +136,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask0(ptr %vp) { define <16 x i16> @test_masked_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; 
CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -150,7 +150,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -164,7 +164,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask define <16 x i16> @test_masked_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -178,7 +178,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -192,7 +192,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask define <16 x i16> @test_masked_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = 
[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -206,7 +206,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -220,7 +220,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask define <16 x i16> @test_16xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -230,7 +230,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask3(ptr %vp) { define <16 x i16> @test_masked_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -244,7 +244,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -258,7 +258,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> @@ -267,7 +267,7 @@ define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -281,7 +281,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = 
[16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -293,7 +293,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -307,7 +307,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -319,7 +319,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -333,7 +333,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -345,7 +345,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> @@ -354,7 +354,7 @@ define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; 
CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -368,7 +368,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -380,7 +380,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_32xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -390,7 +390,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask0(ptr %vp) { define <32 x i16> @test_masked_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -404,7 +404,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %vec2, define <32 
x i16> @test_masked_z_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -418,7 +418,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %mask define <32 x i16> @test_masked_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -432,7 +432,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -446,7 +446,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %mask define <32 x i16> @test_masked_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: 
test_masked_32xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -460,7 +460,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -474,7 +474,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %mask define <32 x i16> @test_32xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -484,7 +484,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask3(ptr %vp) { define <32 x i16> @test_masked_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] +; 
CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -498,7 +498,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -521,7 +521,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -535,7 +535,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -547,7 +547,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> 
%vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -561,7 +561,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -573,7 +573,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -587,7 +587,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -608,7 +608,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: 
test_masked_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -622,7 +622,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -644,7 +644,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(ptr %vp) { define <8 x i32> @test_masked_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -658,7 +658,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -672,7 +672,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { define <8 x i32> @test_masked_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 
{{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -686,7 +686,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -700,7 +700,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { define <8 x i32> @test_masked_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -714,7 +714,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -738,7 +738,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(ptr %vp) { define <8 x i32> @test_masked_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = 
[6,0,0,7,3,7,7,5] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -752,7 +752,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -775,7 +775,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -789,7 +789,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -801,7 +801,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -815,7 +815,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -827,7 +827,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -841,7 +841,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -862,7 +862,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x 
i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -876,7 +876,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -898,7 +898,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(ptr %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -912,7 +912,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -926,7 +926,7 @@ define 
<16 x i32> @test_masked_z_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %mask define <16 x i32> @test_masked_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -940,7 +940,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -954,7 +954,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %mask define <16 x i32> @test_masked_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -968,7 +968,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] ; 
CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -992,7 +992,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(ptr %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1006,7 +1006,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1263,7 +1263,7 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1277,7 +1277,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = 
[0,4,7,6,5,5,1,6] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1313,7 +1313,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1327,7 +1327,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1371,7 +1371,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1385,7 +1385,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] ; 
CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1430,7 +1430,7 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1444,7 +1444,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1490,7 +1490,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(ptr %vp) { define <8 x i64> @test_masked_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1504,7 +1504,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 
{%k1} {z} ; CHECK-NEXT: retq @@ -1544,7 +1544,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(ptr %vp, <8 x i64> %mas define <8 x i64> @test_masked_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1558,7 +1558,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1607,7 +1607,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(ptr %vp, <8 x i64> %mas define <8 x i64> @test_masked_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1621,7 +1621,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1671,7 +1671,7 
@@ define <8 x i64> @test_8xi64_perm_mem_mask6(ptr %vp) { define <8 x i64> @test_masked_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1685,7 +1685,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 17d6266ab7c9e..41e2aa003ce7a 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1294,8 +1294,8 @@ declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind define <32 x i16> @test_x86_avx512_psrlv_w_512_const() optsize { ; X86-LABEL: test_x86_avx512_psrlv_w_512_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X86-NEXT: vpmovsxbw {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x20,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x10,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1303,8 +1303,8 @@ define <32 x i16> @test_x86_avx512_psrlv_w_512_const() optsize { ; ; X64-LABEL: test_x86_avx512_psrlv_w_512_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X64-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X64-NEXT: vpmovsxbw {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x20,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x10,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -1579,8 +1579,8 @@ declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind r define <32 x i16> @test_x86_avx512_psllv_w_512_const() optsize { ; X86-LABEL: test_x86_avx512_psllv_w_512_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X86-NEXT: vpmovsxbw {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x20,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x12,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1588,8 +1588,8 @@ define <32 x i16> @test_x86_avx512_psllv_w_512_const() optsize { ; ; X64-LABEL: test_x86_avx512_psllv_w_512_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X64-NEXT: # encoding: 
[0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X64-NEXT: vpmovsxbw {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x20,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x12,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index d723fc6c05a29..f76b96eda7540 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -2156,18 +2156,18 @@ define <8 x i16>@test_int_x86_avx512_maskz_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x define <8 x i16> @test_int_x86_avx512_psrlv_w_128_const() optsize { ; X86-LABEL: test_int_x86_avx512_psrlv_w_128_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmovsxbw {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x10,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psrlv_w_128_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: 
reloc_riprel_4byte +; X64-NEXT: vpmovsxbw {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x10,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] @@ -2180,18 +2180,18 @@ declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) define <16 x i16> @test_int_x86_avx512_psrlv_w_256_const() optsize { ; X86-LABEL: test_int_x86_avx512_psrlv_w_256_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmovsxbw {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x10,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psrlv_w_256_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpmovsxbw {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 5, value: 
{{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x10,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] @@ -2400,18 +2400,18 @@ define <8 x i16>@test_int_x86_avx512_maskz_psllv8_hi(<8 x i16> %x0, <8 x i16> %x define <8 x i16> @test_int_x86_avx512_psllv_w_128_const() optsize { ; X86-LABEL: test_int_x86_avx512_psllv_w_128_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmovsxbw {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x12,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psllv_w_128_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpmovsxbw {{.*#+}} xmm0 = [4,4,4,4,4,4,4,65535] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x12,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq 
# encoding: [0xc3] @@ -2425,18 +2425,18 @@ declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) define <16 x i16> @test_int_x86_avx512_psllv_w_256_const() optsize { ; X86-LABEL: test_int_x86_avx512_psllv_w_256_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmovsxbw {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x12,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psllv_w_256_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpmovsxbw {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x12,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index a570b392d2bea..c0bb0037923dc 100644 --- 
a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -7323,18 +7323,18 @@ define <8 x i32>@test_int_x86_avx512_maskz_psrav8_si(<8 x i32> %x0, <8 x i32> %x define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() { ; X86-LABEL: test_int_x86_avx512_mask_psrav8_si_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_psrav8_si_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] @@ -8636,18 +8636,18 @@ 
define <2 x i64>@test_int_x86_avx512_maskz_psrav_q_128(<2 x i64> %x0, <2 x i64> define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_psrav_q_128_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [2,0,4294967287,4294967295] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,18446744073709551607] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_psrav_q_128_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [2,18446744073709551607] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,18446744073709551607] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index adf7fa97b7765..a16659eab9763 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ 
b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -28,7 +28,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -37,7 +37,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -67,7 +67,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -76,7 +76,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -221,7 +221,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -268,7 +268,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; 
AVX2-NEXT: vpmovsxwd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -442,10 +442,10 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -505,7 +505,7 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768] diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 50c132b6c34de..48abed8b6f222 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -30,7 +30,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -40,7 +40,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] +; AVX2-NEXT: 
vpmovsxbq {{.*#+}} xmm1 = [1,2] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -81,7 +81,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -91,7 +91,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -279,7 +279,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0 @@ -339,7 +339,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 @@ -569,11 +569,11 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsrlq $63, 
%ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm1 @@ -650,7 +650,7 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 50747d26c1c15..168bd1375a712 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -22,7 +22,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -32,7 +32,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -63,7 +63,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; 
AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -73,7 +73,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 9125f2492ebb8..2e237fb5b07b7 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1046,7 +1046,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] +; AVX-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1 @@ -1061,7 +1061,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; ; AVX2-LABEL: f32xi16_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1079,7 +1079,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-64-LABEL: f32xi16_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1 @@ -1094,7 
+1094,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; ; AVX2-64-LABEL: f32xi16_i256: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-64-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1500,7 +1500,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; AVX-LABEL: f8xi64_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,0,3,0] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,3] ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,1,0,2,0,3,0] ; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 @@ -1515,7 +1515,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; ; AVX2-LABEL: f8xi64_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3] ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1533,7 +1533,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; AVX-64-LABEL: f8xi64_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] +; AVX-64-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,3] ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3] ; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1 @@ -1548,7 +1548,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; ; AVX2-64-LABEL: f8xi64_i256: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] +; AVX2-64-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3] ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index 2d289017a89b8..55d72832e7de6 100644 --- 
a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -16,14 +16,14 @@ define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) { define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) { ; SSE-LABEL: combine_vec_add_constant_sub: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2,4,6] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [0,2,4,6] ; SSE-NEXT: psubd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_add_constant_sub: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6] +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,2,4,6] ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> , %a @@ -230,7 +230,7 @@ define void @PR52039(ptr %pa, ptr %pb) { ; SSE: # %bb.0: ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: movdqu 16(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [10,10,10,10] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm2 = [10,10,10,10] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psubd %xmm1, %xmm3 ; SSE-NEXT: psubd %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-addo.ll b/llvm/test/CodeGen/X86/combine-addo.ll index af51c04765224..ba748b6e653cf 100644 --- a/llvm/test/CodeGen/X86/combine-addo.ll +++ b/llvm/test/CodeGen/X86/combine-addo.ll @@ -77,7 +77,7 @@ define <4 x i32> @combine_vec_uadd_not(<4 x i32> %a0, <4 x i32> %a1) { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] ; SSE-NEXT: pmaxud %xmm2, %xmm0 ; SSE-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index d223b75419ac4..71335a7b58821 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -644,7 +644,7 @@ define <8 x i64> @neg_scalar_broadcast_v8i64(i64 %a0, <2 x i64> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def 
$zmm0 ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] ; AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll index 6c266be808eaf..637943023d38c 100644 --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -34,7 +34,7 @@ define <2 x i64> @bitselect_v2i64_rr(<2 x i64>, <2 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744069414584319,18446744060824649725] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967295,4294967294,4294967293,4294967292] ; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -77,7 +77,7 @@ define <2 x i64> @bitselect_v2i64_rm(<2 x i64>, ptr nocapture readonly) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744065119617022,18446744073709551612] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967294,4294967293,4294967292,4294967295] ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -122,7 +122,7 @@ define <2 x i64> @bitselect_v2i64_mr(ptr nocapture readonly, <2 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [12884901890,4294967296] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,3,0,1] ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$zmm0 ; AVX512F-NEXT: vzeroupper @@ -153,7 +153,7 @@ define <2 x i64> @bitselect_v2i64_mm(ptr nocapture readonly, ptr nocapture reado ; XOP-LABEL: bitselect_v2i64_mm: ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rsi), %xmm0 -; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022] +; XOP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967292,4294967295,4294967294,4294967293] ; XOP-NEXT: vpcmov %xmm1, (%rdi), %xmm0, %xmm0 ; XOP-NEXT: retq ; @@ -170,7 +170,7 @@ define <2 x i64> @bitselect_v2i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551612,18446744065119617022] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967292,4294967295,4294967294,4294967293] ; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -179,7 +179,7 @@ define <2 x i64> @bitselect_v2i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512VL-LABEL: bitselect_v2i64_mm: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551612,18446744065119617022] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967292,4294967295,4294967294,4294967293] ; AVX512VL-NEXT: vpternlogq $202, (%rdi), %xmm1, %xmm0 ; AVX512VL-NEXT: retq %3 = load <2 x i64>, ptr %0 @@ -327,7 +327,7 @@ define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4294967295,4294967294,4294967293,4294967292,4294967293,4294967292,4294967293,4294967292] ; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def 
$ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -377,8 +377,7 @@ define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, ptr nocapture readonly) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4294967294,4294967293,4294967292,4294967295,4294967294,4294967293,4294967292,4294967295] ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -430,8 +429,7 @@ define <4 x i64> @bitselect_v4i64_mr(ptr nocapture readonly, <4 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,3,0,1,2,3,0,1] ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -484,8 +482,7 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4294967292,4294967295,4294967294,4294967293,4294967292,4294967295,4294967294,4294967293] ; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -493,8 +490,7 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512VL-LABEL: 
bitselect_v4i64_mm: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967292,4294967295,4294967294,4294967293,4294967292,4294967295,4294967294,4294967293] ; AVX512VL-NEXT: vpternlogq $202, (%rdi), %ymm1, %ymm0 ; AVX512VL-NEXT: retq %3 = load <4 x i64>, ptr %0 diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index 9d7afb9478b17..c54a842227ad9 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -134,12 +134,12 @@ define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) { define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_negpow2c: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm2 = [4294967295,0,4294967295,0] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614] +; SSE-NEXT: pmovsxbq {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614] ; SSE-NEXT: pmuludq %xmm5, %xmm4 ; SSE-NEXT: paddq %xmm3, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 @@ -148,7 +148,7 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; SSE-NEXT: pmuludq %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600] +; SSE-NEXT: pmovsxbq {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600] ; SSE-NEXT: pmuludq %xmm4, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -161,7 +161,7 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[4294967295,4294967295,4294967295,4294967295] ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] +; AVX-NEXT: vpmovsxbq {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] ; AVX-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vpsllq $32, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll index 6ec95427f8755..0743592f3ca11 100644 --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -21,7 +21,7 @@ define <16 x i8> @combine_pavgb_self(<16 x i8> %a0) { define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) { ; SSE-LABEL: combine_pavgw_knownbits: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; SSE-NEXT: pmovsxbw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pavgw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index e1d963ad1ec99..aa3bea2791416 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -104,7 +104,7 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { ; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [715827883,715827883] +; SSE-NEXT: pmovsxdq {{.*#+}} xmm4 = [715827883,715827883] ; SSE-NEXT: pmuludq %xmm4, %xmm0 ; SSE-NEXT: pmuludq %xmm4, %xmm1 ; SSE-NEXT: pmuludq %xmm4, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index e10d94c16696a..1ace4a9837e5c 100644 --- 
a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -402,8 +402,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -422,7 +421,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -438,7 +437,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -468,8 +467,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; XOP-NEXT: # xmm2 = mem[0,0] +; XOP-NEXT: 
vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq %1 = sdiv <16 x i8> %x, @@ -639,7 +637,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psraw $15, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [u,4,2,16,8,32,64,2] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2] ; SSE41-NEXT: pmulhuw %xmm3, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -662,7 +660,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4,2,16,8,32,64,2] +; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2] ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -718,10 +716,10 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,65522,65521,65524,65523,65525,65526,65521] +; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65522,65521,65524,65523,65525,65526,65521] ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,65534,65535,65532,65533,65531,65530,65535] +; XOP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535] ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3 @@ -853,7 +851,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: psraw $15, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = 
[u,4,2,16,8,32,64,2] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [0,4,2,16,8,32,64,2] ; SSE41-NEXT: pmulhuw %xmm5, %xmm6 ; SSE41-NEXT: paddw %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -894,7 +892,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [u,4,2,16,8,32,64,2] +; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,4,2,16,8,32,64,2] ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -995,10 +993,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3 -; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,65522,65521,65524,65523,65525,65526,65521] +; XOP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,65522,65521,65524,65523,65525,65526,65521] ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,65534,65535,65532,65533,65531,65530,65535] +; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535] ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5 @@ -1234,10 +1232,10 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4294967266,4294967267,4294967268] +; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967266,4294967267,4294967268] ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,4294967294,4294967293,4294967292] +; XOP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] ; XOP-NEXT: vpshad 
%xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3 @@ -1510,10 +1508,10 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3 -; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,4294967266,4294967267,4294967268] +; XOP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,4294967266,4294967267,4294967268] ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4294967294,4294967293,4294967292] +; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292] ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 @@ -1718,7 +1716,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [u,2,3,4] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,3,4] ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2 @@ -1737,8 +1735,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; XOP: # %bb.0: -; XOP-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] -; XOP-NEXT: # xmm1 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 @@ -1904,10 +1901,10 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,62,61,60] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = 
[0,62,61,60] ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,2,3,4] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,2,3,4] ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,2305843009213693952,1152921504606846976,576460752303423488] ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3 @@ -1949,19 +1946,17 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] -; XOP-NEXT: # xmm3 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6 -; XOP-NEXT: vmovddup {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614] -; XOP-NEXT: # xmm7 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614] ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] @@ -2697,7 +2692,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_nonuniform5: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] ; 
SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: paddw %xmm1, %xmm0 @@ -2810,7 +2805,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_nonuniform6: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: paddw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll index a05da63e43e12..b485a9b10f26c 100644 --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -148,8 +148,7 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and: ; AVX-FAST-ALL: # %bb.0: -; AVX-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; AVX-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] ; AVX-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index db37db7ec1be5..cc0ed2b8268c6 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -178,8 +178,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; 
AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 @@ -225,7 +224,7 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -297,7 +296,7 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll index 9ad211ac926eb..49ce2455ae8c7 100644 --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -296,7 +296,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) { ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,3] ; AVX2-NEXT: vpsravd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpsllvd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 125196a0819b6..6807be4bf3481 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -175,8 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr0: ; AVX2-FAST-ALL: # %bb.0: ; 
AVX2-FAST-ALL-NEXT: vpsrlq $48, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -230,8 +229,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) { ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -435,8 +433,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll index 8be82efbacd6f..13d5c9f185645 100644 --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -231,7 +231,7 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) { ; ; SSE41-LABEL: combine_trunc_v8i32_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = 
[65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -240,7 +240,7 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) { ; ; SSE42-LABEL: combine_trunc_v8i32_v8i16: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE42-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE42-NEXT: pminud %xmm3, %xmm2 ; SSE42-NEXT: pminud %xmm3, %xmm1 ; SSE42-NEXT: packusdw %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll index 5a823cb653b8e..9d5934c345f8a 100644 --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -104,14 +104,14 @@ define <4 x i32> @combine_vec_sub_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_vec_sub_constant_add(<4 x i32> %a) { ; SSE-LABEL: combine_vec_sub_constant_add: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [3,1,4294967295,4294967293] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [3,1,4294967295,4294967293] ; SSE-NEXT: psubd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sub_constant_add: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,1,4294967295,4294967293] +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,4294967295,4294967293] ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %a, diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index f94ff72274d63..04ea514b3e801 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -435,7 +435,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; XOP-LABEL: combine_vec_udiv_by_shl_pow2b: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] +; XOP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] ; XOP-NEXT: vpsubd 
%xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -659,7 +659,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -673,7 +673,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; XOP-NEXT: movl $249, %eax ; XOP-NEXT: vmovd %eax, %xmm2 ; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq %div = udiv <16 x i8> %x, diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll index d17ea107ba096..715d5c7b28f11 100644 --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -346,7 +346,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_shl_pow2b: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,8,16] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -362,7 +362,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_urem_by_lshr_pow2a: ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,4,4,4] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm3 = [4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 ; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] @@ -419,7 +419,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_urem_by_lshr_pow2b: ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,4,8,16] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,4,8,16] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] @@ -442,7 +442,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) { ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,8,16] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,8,16] ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 ; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 @@ -461,7 +461,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,8,16] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll index 350b4321e3b88..f26368c02de2b 100644 --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -93,7 +93,7 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, 
%xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -169,7 +169,7 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -201,7 +201,7 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -272,7 +272,7 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -348,7 +348,7 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -380,7 +380,7 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; 
AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -451,7 +451,7 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -527,7 +527,7 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -559,7 +559,7 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -630,7 +630,7 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: 
vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -706,7 +706,7 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -738,7 +738,7 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll index d2310aba0e3f4..93dbe99882fe0 100644 --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -140,9 +140,9 @@ define <16 x i64> @load_catcat(ptr %p) { ; AVX512F-LABEL: load_catcat: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5] ; AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7] ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: retq %x = load <4 x i64>, ptr %p diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 7dbeba96a34cb..2001fddfaac40 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ 
b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -884,7 +884,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 @@ -904,7 +904,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; CHECK-NO-FASTFMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1 @@ -1070,8 +1070,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $56, %rsp ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,0,0,2,2,0,0] -; CHECK-AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax @@ -1101,8 +1100,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: 
vbroadcasti128 {{.*#+}} ymm1 = [2,2,0,0,2,2,0,0] -; CHECK-NO-FASTFMA-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NO-FASTFMA-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index 933c546503048..c8708ea9b681f 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -198,7 +198,7 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind { ; AVX2-NEXT: vcvttsd2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -576,8 +576,7 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper @@ -1060,8 +1059,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 
killed $ymm0 ; AVX2-NEXT: vzeroupper @@ -2855,7 +2853,7 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) nounwind { ; AVX2-NEXT: vcvttsd2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -3228,8 +3226,7 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper @@ -3707,8 +3704,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll index 7f2fd4573de3b..dab5851657374 100644 --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -352,12 +352,10 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; ; X64-AVX-LABEL: clamp_sitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] -; X64-AVX-NEXT: # xmm1 = mem[0,0] +; X64-AVX-NEXT: vpmovsxwq 
{{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] ; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; X64-AVX-NEXT: # xmm1 = mem[0,0] +; X64-AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [255,255] ; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll index 7ad2bb712ae93..04365d60dfb30 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -120,7 +120,7 @@ define <4 x i1> @illegal_abs_to_eq_or(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: packssdw %xmm1, %xmm2 @@ -178,7 +178,7 @@ define <4 x i64> @illegal_abs_to_eq_or_sext(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -242,7 +242,7 @@ define <4 x i1> @illegal_abs_to_ne_and(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: packssdw %xmm1, %xmm2 @@ -306,7 +306,7 @@ define <4 x i64> @illegal_abs_to_ne_and_sext(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; 
SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm3, %xmm2 @@ -518,11 +518,11 @@ define <4 x i1> @eq_or_to_abs_vec4x64(<4 x i64> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 @@ -574,11 +574,11 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x64_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 @@ -649,12 +649,12 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] ; 
SSE41-NEXT: pcmpeqq %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 @@ -712,12 +712,12 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x64_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 @@ -944,7 +944,7 @@ define <4 x i1> @eq_or_to_abs_vec4x16(<4 x i16> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [88,88,88,88,88,88,88,88] ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -1093,7 +1093,7 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [88,88,88,88,88,88,88,88] ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll index dd6553bec4b1d..dada1726be424 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ -321,15 +321,25 @@ define <8 x i1> @addand_ne_v8i16_fail(<8 x i16> %x) nounwind { ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; SSE-LABEL: addand_ne_v8i16_fail: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[65533,65533,65533,65533,65533,65533,65533,65533] -; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE41-LABEL: addand_ne_v8i16_fail: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65533,65533,65533,65533,65533,65533,65533,65533] +; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; SSE2-LABEL: addand_ne_v8i16_fail: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65533,65533,65533,65533,65533,65533,65533,65533] +; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: retq %cmp1 = icmp ne <8 x i16> %x, %cmp2 = icmp ne <8 x i16> %x, %r = and <8 x i1> %cmp1, %cmp2 diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index bc977e006606e..f371ec10fe25f 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -53,27 +53,39 @@ define <16 x i8> @elt0_v16i8(i8 %x) { } define <8 x i16> @elt5_v8i16(i16 %x) { -; X86-SSE-LABEL: elt5_v8i16: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] -; X86-SSE-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: elt5_v8i16: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] +; X86-SSE2-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: elt5_v8i16: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] -; X64-SSE-NEXT: pinsrw $5, %edi, %xmm0 -; X64-SSE-NEXT: retq +; 
X64-SSE2-LABEL: elt5_v8i16: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] +; X64-SSE2-NEXT: pinsrw $5, %edi, %xmm0 +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: elt5_v8i16: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] +; X86-SSE4-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: elt5_v8i16: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] +; X64-SSE4-NEXT: pinsrw $5, %edi, %xmm0 +; X64-SSE4-NEXT: retq ; ; X86-AVX-LABEL: elt5_v8i16: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] +; X86-AVX-NEXT: vpmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] ; X86-AVX-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt5_v8i16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] +; X64-AVX-NEXT: vpmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] ; X64-AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <8 x i16> , i16 %x, i32 5 @@ -99,25 +111,25 @@ define <4 x i32> @elt3_v4i32(i32 %x) { ; ; X86-SSE4-LABEL: elt3_v4i32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,u] +; X86-SSE4-NEXT: pmovsxbd {{.*#+}} xmm0 = [42,1,2,0] ; X86-SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt3_v4i32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,u] +; X64-SSE4-NEXT: pmovsxbd {{.*#+}} xmm0 = [42,1,2,0] ; X64-SSE4-NEXT: pinsrd $3, %edi, %xmm0 ; X64-SSE4-NEXT: retq ; ; X86-AVX-LABEL: elt3_v4i32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,u] +; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [42,1,2,0] ; X86-AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt3_v4i32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,u] +; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [42,1,2,0] 
; X64-AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <4 x i32> , i32 %x, i32 3 @@ -140,7 +152,7 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; ; X64-SSE4-LABEL: elt0_v2i64: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [u,1] +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,1] ; X64-SSE4-NEXT: pinsrq $0, %rdi, %xmm0 ; X64-SSE4-NEXT: retq ; @@ -150,24 +162,11 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-AVX-NEXT: retl ; -; X64-AVX1-LABEL: elt0_v2i64: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] -; X64-AVX1-NEXT: # xmm0 = mem[0,0] -; X64-AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: elt0_v2i64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; X64-AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq -; -; X64-AVX512F-LABEL: elt0_v2i64: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; X64-AVX512F-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; X64-AVX512F-NEXT: retq +; X64-AVX-LABEL: elt0_v2i64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] +; X64-AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ins = insertelement <2 x i64> , i64 %x, i32 0 ret <2 x i64> %ins } @@ -268,14 +267,14 @@ define <8 x i32> @elt7_v8i32(i32 %x) { ; ; X86-SSE4-LABEL: elt7_v8i32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [4,5,6,u] +; X86-SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [4,5,6,0] ; X86-SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm1 ; X86-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt7_v8i32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [4,5,6,u] +; X64-SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [4,5,6,0] ; X64-SSE4-NEXT: pinsrd $3, %edi, %xmm1 ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X64-SSE4-NEXT: retq @@ -394,7 +393,7 @@ define <8 x 
i64> @elt5_v8i64(i64 %x) { ; ; X64-SSE4-LABEL: elt5_v8i64: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4,u] +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm2 = [4,4] ; X64-SSE4-NEXT: pinsrq $1, %rdi, %xmm2 ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1] ; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = [2,3] @@ -429,7 +428,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X64-AVX2-LABEL: elt5_v8i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4,u,6,7] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,0,6,7] ; X64-AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] @@ -448,8 +447,8 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X64-AVX512F-LABEL: elt5_v8i64: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovq %rdi, %xmm1 -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7] -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,1,2,3,4,u,6,7] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [42,1,2,3,4,0,6,7] ; X64-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; X64-AVX512F-NEXT: retq %ins = insertelement <8 x i64> , i64 %x, i32 5 diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll index 57ab9344c4fd8..e2defdc370543 100644 --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -47,7 +47,7 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind ; X86_AVX512-LABEL: insert_subvector_512: ; X86_AVX512: # %bb.0: ; X86_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X86_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0] +; X86_AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7] ; X86_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; X86_AVX512-NEXT: retl ; @@ -55,7 +55,7 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, 
<8 x i64> %v) nounwind ; X64_AVX512: # %bb.0: ; X64_AVX512-NEXT: vmovd %edi, %xmm1 ; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 -; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7] +; X64_AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7] ; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; X64_AVX512-NEXT: retq %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index e500801b69c4d..9648daf7427b1 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -62,13 +62,13 @@ define <4 x double> @signbits_ashr_sitofp_0(<4 x i64> %a0) nounwind { ; X86-NEXT: vpsrlq $36, %xmm1, %xmm2 ; X86-NEXT: vpsrlq $35, %xmm1, %xmm1 ; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [268435456,0,134217728,0] +; X86-NEXT: vpmovsxdq {{.*#+}} xmm2 = [268435456,134217728] ; X86-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpsrlq $34, %xmm0, %xmm2 ; X86-NEXT: vpsrlq $33, %xmm0, %xmm0 ; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [1073741824,0,536870912,0] +; X86-NEXT: vpmovsxdq {{.*#+}} xmm2 = [1073741824,536870912] ; X86-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] @@ -81,13 +81,13 @@ define <4 x double> @signbits_ashr_sitofp_0(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vpsrlq $36, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpsrlq $35, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [268435456,134217728] +; X64-AVX1-NEXT: vpmovsxdq {{.*#+}} xmm2 = [268435456,134217728] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpsrlq $34, %xmm0, %xmm2 ; X64-AVX1-NEXT: vpsrlq 
$33, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1073741824,536870912] +; X64-AVX1-NEXT: vpmovsxdq {{.*#+}} xmm2 = [1073741824,536870912] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] @@ -97,7 +97,7 @@ define <4 x double> @signbits_ashr_sitofp_0(<4 x i64> %a0) nounwind { ; X64-AVX2-LABEL: signbits_ashr_sitofp_0: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1073741824,536870912,268435456,134217728] +; X64-AVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [1073741824,536870912,268435456,134217728] ; X64-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 60db806450ebc..89459a2d10177 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -6429,7 +6429,7 @@ define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) { ; ; AVX2-LABEL: mload_constmask_v4i32: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: retq @@ -6962,7 +6962,7 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(ptr %addr) { ; ; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -7203,12 +7203,12 @@ define <16 x i64> 
@load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ; ; AVX2-LABEL: load_one_mask_bit_set6: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,18446744073709551615,0] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm4, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm4, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,18446744073709551615,0,0] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,18446744073709551615,0,0] ; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm4, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 185eb50435e8d..898b34e969b1d 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4758,9 +4758,9 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 ; ; AVX2-LABEL: mstore_constmask_allones_split: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,18446744073709551615] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm5, %ymm0, 32(%rdi) -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm4, %ymm0, (%rdi) ; AVX2-NEXT: vmovups %ymm7, 96(%rdi) ; AVX2-NEXT: vmovups %ymm6, 64(%rdi) @@ -4969,9 +4969,9 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ; ; AVX2-LABEL: one_mask_bit_set6: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,18446744073709551615] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = 
[0,0,0,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm2, %ymm0, 64(%rdi) -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,18446744073709551615,0] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, 32(%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 6c21bb4d99e74..c6e96d183f9d6 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -674,7 +674,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm6, %xmm6 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm7 = [255,255] ; SSE4-NEXT: pand %xmm7, %xmm3 ; SSE4-NEXT: pand %xmm7, %xmm2 ; SSE4-NEXT: packusdw %xmm3, %xmm2 @@ -2727,7 +2727,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm9 = [255,255,255,255] ; SSE4-NEXT: pand %xmm9, %xmm3 ; SSE4-NEXT: pand %xmm9, %xmm2 ; SSE4-NEXT: packusdw %xmm3, %xmm2 @@ -3720,7 +3720,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm5 = [255,255,255,255] ; SSE4-NEXT: pand %xmm5, %xmm1 ; SSE4-NEXT: pand %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index bd1e6d320b69e..487f7298f442c 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ 
b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -183,7 +183,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [2147483647,2147483647] +; SSE4-NEXT: pmovsxdq {{.*#+}} xmm10 = [2147483647,2147483647] ; SSE4-NEXT: movdqa %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: movdqa %xmm10, %xmm8 @@ -199,7 +199,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm6 @@ -565,7 +565,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm9 = [32767,32767] ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: movdqa %xmm9, %xmm8 @@ -581,7 +581,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848] ; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm10 @@ -665,8 +665,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] -; 
AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -677,8 +676,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 @@ -1108,7 +1106,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [127,127] +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm9 = [127,127] ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: movdqa %xmm9, %xmm8 @@ -1124,7 +1122,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] ; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm10 @@ -1209,8 +1207,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, 
%xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -1221,8 +1218,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 @@ -1572,7 +1568,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE4-NEXT: pmovsxdq {{.*#+}} xmm5 = [2147483647,2147483647] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 ; SSE4-NEXT: movdqa %xmm5, %xmm6 @@ -1580,7 +1576,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 @@ -1805,7 +1801,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [32767,32767] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm5 = [32767,32767] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 ; SSE4-NEXT: movdqa %xmm5, %xmm6 @@ -1813,7 +1809,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: 
movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 @@ -1858,15 +1854,13 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-LABEL: truncstore_v4i64_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2122,7 +2116,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127] +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm5 = [127,127] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 ; SSE4-NEXT: movdqa %xmm5, %xmm6 @@ -2130,7 +2124,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] 
; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 @@ -2176,15 +2170,13 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-LABEL: truncstore_v4i64_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2397,11 +2389,11 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE4-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: pmovsxdq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 @@ -2559,11 +2551,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm4 = [32767,32767] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; 
SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 @@ -2587,65 +2579,34 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX1-LABEL: truncstore_v2i64_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovmskpd %xmm1, %eax -; AVX1-NEXT: xorl $3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: jne .LBB7_1 -; AVX1-NEXT: # %bb.2: # %else -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: jne .LBB7_3 -; AVX1-NEXT: .LBB7_4: # %else2 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB7_1: # %cond.store -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: je .LBB7_4 -; AVX1-NEXT: .LBB7_3: # %cond.store1 -; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: truncstore_v2i64_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [32767,32767] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX2-NEXT: vpcmpgtq %xmm3, 
%xmm0, %xmm4 -; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovmskpd %xmm1, %eax -; AVX2-NEXT: xorl $3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: jne .LBB7_1 -; AVX2-NEXT: # %bb.2: # %else -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: jne .LBB7_3 -; AVX2-NEXT: .LBB7_4: # %else2 -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB7_1: # %cond.store -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: je .LBB7_4 -; AVX2-NEXT: .LBB7_3: # %cond.store1 -; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: truncstore_v2i64_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskpd %xmm1, %eax +; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB7_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB7_3 +; AVX-NEXT: .LBB7_4: # %else2 +; AVX-NEXT: retq +; AVX-NEXT: .LBB7_1: # %cond.store +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: testb $2, %al +; AVX-NEXT: je .LBB7_4 +; AVX-NEXT: .LBB7_3: # %cond.store1 +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2758,11 +2719,11 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: movdqa 
{{.*#+}} xmm4 = [127,127] +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm4 = [127,127] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 @@ -2785,63 +2746,33 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX1-LABEL: truncstore_v2i64_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovmskpd %xmm1, %eax -; AVX1-NEXT: xorl $3, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: jne .LBB8_1 -; AVX1-NEXT: # %bb.2: # %else -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: jne .LBB8_3 -; AVX1-NEXT: .LBB8_4: # %else2 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB8_1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: je .LBB8_4 -; AVX1-NEXT: .LBB8_3: # %cond.store1 -; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: truncstore_v2i64_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [127,127] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = 
[18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovmskpd %xmm1, %eax -; AVX2-NEXT: xorl $3, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: jne .LBB8_1 -; AVX2-NEXT: # %bb.2: # %else -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: jne .LBB8_3 -; AVX2-NEXT: .LBB8_4: # %else2 -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB8_1: # %cond.store -; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: je .LBB8_4 -; AVX2-NEXT: .LBB8_3: # %cond.store1 -; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: truncstore_v2i64_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskpd %xmm1, %eax +; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB8_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB8_3 +; AVX-NEXT: .LBB8_4: # %else2 +; AVX-NEXT: retq +; AVX-NEXT: .LBB8_1: # %cond.store +; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX-NEXT: testb $2, %al +; AVX-NEXT: je .LBB8_4 +; AVX-NEXT: .LBB8_3: # %cond.store1 +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 715df982e1a06..48a60622b4dea 100644 --- 
a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -2603,7 +2603,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE4-LABEL: truncstore_v16i32_v16i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535] +; SSE4-NEXT: pmovsxbw {{.*#+}} xmm8 = [65535,0,65535,0,65535,0,65535,0] ; SSE4-NEXT: pminud %xmm8, %xmm1 ; SSE4-NEXT: pminud %xmm8, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -3293,7 +3293,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm9 = [255,255,255,255] ; SSE4-NEXT: pminud %xmm9, %xmm1 ; SSE4-NEXT: pminud %xmm9, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -3937,7 +3937,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] +; SSE4-NEXT: pmovsxbw {{.*#+}} xmm5 = [65535,0,65535,0,65535,0,65535,0] ; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -4327,7 +4327,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm5 = [255,255,255,255] ; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index acf4d900745d3..d5c37225b93d5 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ 
b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -960,7 +960,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -996,7 +996,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1170,7 +1170,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1206,7 +1206,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: 
vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1360,7 +1360,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1397,7 +1397,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1550,7 +1550,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1587,7 +1587,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; 
AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1743,7 +1743,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1781,7 +1781,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 4c605b10f66b6..cc08396ae8c78 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -399,8 +399,7 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -458,8 +457,7 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; XOP-NEXT: vpsrlq $1, 
%xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: # xmm8 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -571,8 +569,7 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 -; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 @@ -634,8 +631,7 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: # xmm8 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -745,8 +741,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -806,8 +801,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: # xmm8 = mem[0,0] +; XOP-NEXT: vpmovsxbq 
{{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -917,8 +911,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -978,8 +971,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: # xmm8 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -1090,8 +1082,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm9 @@ -1153,8 +1144,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] -; XOP-NEXT: # xmm8 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm9 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll 
b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 5cd0c232de44d..b28a3f821141a 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1144,7 +1144,7 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-lega ; CHECK-LABEL: trunc_v16i32_v16i16_zeroes: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %x @@ -1199,7 +1199,7 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal- ; CHECK-LABEL: trunc_v16i32_v16i16_sign: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %x diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index f26bbb7e5c2bd..a7564c9622c5c 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -1984,7 +1984,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) { ; KNL-LABEL: allones_v2i64_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; KNL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al @@ -3185,7 +3185,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) { ; KNL-LABEL: allones_v2i64_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; KNL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; KNL-NEXT: 
vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index de192aa038e5b..5da18ee6ad7c4 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1308,9 +1308,9 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -1327,12 +1327,12 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] ; 
AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -1353,9 +1353,9 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] diff --git a/llvm/test/CodeGen/X86/packus.ll 
b/llvm/test/CodeGen/X86/packus.ll index 1b0af955ae824..384e40496d82a 100644 --- a/llvm/test/CodeGen/X86/packus.ll +++ b/llvm/test/CodeGen/X86/packus.ll @@ -131,7 +131,7 @@ define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) { ; SSE4-LABEL: trunc_lshr_v4i64_demandedelts: ; SSE4: # %bb.0: ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE4-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,1,1,1] ; SSE4-NEXT: pand %xmm2, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE4-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll index 3a73ca1de11a1..81e0a09d0e3fa 100644 --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -993,12 +993,26 @@ define <16 x i16> @test27(<16 x i16> %x) { } define <16 x i16> @test28(<16 x i16> %x) { -; SSE-LABEL: test28: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSE-NEXT: paddusw %xmm2, %xmm0 -; SSE-NEXT: paddusw %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test28: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSE2-NEXT: paddusw %xmm2, %xmm0 +; SSE2-NEXT: paddusw %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: test28: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSSE3-NEXT: paddusw %xmm2, %xmm0 +; SSSE3-NEXT: paddusw %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: test28: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSE41-NEXT: paddusw %xmm2, %xmm0 +; SSE41-NEXT: paddusw %xmm2, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: test28: ; AVX1: # %bb.0: @@ -1115,12 +1129,26 @@ define <16 x i16> @test29(<16 x i16> %x) { } define <16 x i16> @test30(<16 x i16> %x) { -; SSE-LABEL: test30: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] -; 
SSE-NEXT: paddusw %xmm2, %xmm0 -; SSE-NEXT: paddusw %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test30: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; SSE2-NEXT: paddusw %xmm2, %xmm0 +; SSE2-NEXT: paddusw %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: test30: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; SSSE3-NEXT: paddusw %xmm2, %xmm0 +; SSSE3-NEXT: paddusw %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: test30: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; SSE41-NEXT: paddusw %xmm2, %xmm0 +; SSE41-NEXT: paddusw %xmm2, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: test30: ; AVX1: # %bb.0: @@ -1294,14 +1322,32 @@ define <32 x i16> @test33(<32 x i16> %x) { } define <32 x i16> @test34(<32 x i16> %x) { -; SSE-LABEL: test34: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSE-NEXT: paddusw %xmm4, %xmm0 -; SSE-NEXT: paddusw %xmm4, %xmm1 -; SSE-NEXT: paddusw %xmm4, %xmm2 -; SSE-NEXT: paddusw %xmm4, %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test34: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSE2-NEXT: paddusw %xmm4, %xmm0 +; SSE2-NEXT: paddusw %xmm4, %xmm1 +; SSE2-NEXT: paddusw %xmm4, %xmm2 +; SSE2-NEXT: paddusw %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: test34: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSSE3-NEXT: paddusw %xmm4, %xmm0 +; SSSE3-NEXT: paddusw %xmm4, %xmm1 +; SSSE3-NEXT: paddusw %xmm4, %xmm2 +; SSSE3-NEXT: paddusw %xmm4, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: test34: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSE41-NEXT: paddusw %xmm4, %xmm0 +; SSE41-NEXT: paddusw %xmm4, %xmm1 +; SSE41-NEXT: paddusw %xmm4, %xmm2 +; SSE41-NEXT: paddusw %xmm4, %xmm3 +; SSE41-NEXT: retq ; ; AVX1-LABEL: test34: 
; AVX1: # %bb.0: @@ -1478,14 +1524,32 @@ define <32 x i16> @test35(<32 x i16> %x) { } define <32 x i16> @test36(<32 x i16> %x) { -; SSE-LABEL: test36: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] -; SSE-NEXT: paddusw %xmm4, %xmm0 -; SSE-NEXT: paddusw %xmm4, %xmm1 -; SSE-NEXT: paddusw %xmm4, %xmm2 -; SSE-NEXT: paddusw %xmm4, %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test36: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] +; SSE2-NEXT: paddusw %xmm4, %xmm0 +; SSE2-NEXT: paddusw %xmm4, %xmm1 +; SSE2-NEXT: paddusw %xmm4, %xmm2 +; SSE2-NEXT: paddusw %xmm4, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: test36: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] +; SSSE3-NEXT: paddusw %xmm4, %xmm0 +; SSSE3-NEXT: paddusw %xmm4, %xmm1 +; SSSE3-NEXT: paddusw %xmm4, %xmm2 +; SSSE3-NEXT: paddusw %xmm4, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: test36: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] +; SSE41-NEXT: paddusw %xmm4, %xmm0 +; SSE41-NEXT: paddusw %xmm4, %xmm1 +; SSE41-NEXT: paddusw %xmm4, %xmm2 +; SSE41-NEXT: paddusw %xmm4, %xmm3 +; SSE41-NEXT: retq ; ; AVX1-LABEL: test36: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 8e6ae4b552657..5795b10cf4dfb 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -24,7 +24,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand 
%xmm3, %xmm0 @@ -109,20 +109,31 @@ entry: } define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind { -; SSE-LABEL: mul_v2i64c: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: mul_v2i64c: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: mul_v2i64c: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [117,117] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: mul_v2i64c: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [117,117] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [117,117] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -396,7 +407,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] ; SSE41-NEXT: pmullw %xmm4, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm5, %xmm0 @@ -452,12 +463,19 @@ entry: } define <16 x i16> 
@mul_v16i16c(<16 x i16> %i) nounwind { -; SSE-LABEL: mul_v16i16c: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] -; SSE-NEXT: pmullw %xmm2, %xmm0 -; SSE-NEXT: pmullw %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: mul_v16i16c: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: mul_v16i16c: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pmullw %xmm2, %xmm1 +; SSE41-NEXT: retq ; ; AVX-LABEL: mul_v16i16c: ; AVX: # %bb.0: # %entry @@ -488,7 +506,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { ; ; SSE41-LABEL: mul_v8i32c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [117,117,117,117] ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pmulld %xmm2, %xmm1 ; SSE41-NEXT: retq @@ -504,22 +522,39 @@ entry: } define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { -; SSE-LABEL: mul_v4i64c: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: mul_v4i64c: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; 
SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: paddq %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: mul_v4i64c: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [117,117] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: pmuludq %xmm2, %xmm0 +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: paddq %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: pmuludq %xmm2, %xmm1 +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: paddq %xmm3, %xmm1 +; SSE41-NEXT: retq ; ; AVX-LABEL: mul_v4i64c: ; AVX: # %bb.0: # %entry @@ -764,7 +799,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117] ; SSE41-NEXT: pmullw %xmm6, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm7, %xmm1 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 1110146d3cda8..c2a009f06b89d 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -329,7 +329,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767] ; SSE41-NEXT: pand %xmm8, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 diff --git 
a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll index 02669a6212d87..8843a0410a9f7 100644 --- a/llvm/test/CodeGen/X86/pr48215.ll +++ b/llvm/test/CodeGen/X86/pr48215.ll @@ -12,11 +12,11 @@ define i32 @PR48215(i32 %a0, i32 %a1) { ; AVX1-NEXT: idivl %esi ; AVX1-NEXT: vmovd %edx, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX1-NEXT: vmovd %eax, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,1,2,3] ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovmskps %ymm2, %ecx diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll index a6ae7ce5ccd15..57f52c8dcdbb0 100644 --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -7,7 +7,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: vpbroadcastw (%rax), %xmm0 ; CHECK-NEXT: vmovdqu (%rax), %ymm2 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-NEXT: movzwl %ax, %eax diff --git a/llvm/test/CodeGen/X86/pr61964.ll b/llvm/test/CodeGen/X86/pr61964.ll index 24f150727a184..1949841ea216b 100644 --- a/llvm/test/CodeGen/X86/pr61964.ll +++ b/llvm/test/CodeGen/X86/pr61964.ll @@ -30,7 +30,7 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; AVX2-LABEL: splitTransposeDecode_8_avx2: ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] ; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1 @@ -39,9 +39,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; ; AVX512VL-LABEL: splitTransposeDecode_8_avx2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27] ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31] ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm3 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: vmovdqa %ymm3, %ymm1 @@ -67,7 +67,7 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; XOPAVX2-LABEL: splitTransposeDecode_8_avx2: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] +; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] ; XOPAVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1 diff --git a/llvm/test/CodeGen/X86/pr62014.ll b/llvm/test/CodeGen/X86/pr62014.ll index a2e01d44f2bfb..d80d395c9d7b2 100644 --- a/llvm/test/CodeGen/X86/pr62014.ll +++ b/llvm/test/CodeGen/X86/pr62014.ll @@ -26,7 +26,7 @@ 
define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2 ; SSE42-NEXT: movapd %xmm0, %xmm2 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,2] ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm3, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 @@ -38,7 +38,7 @@ define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2 ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,2] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -91,7 +91,7 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4 ; SSE42-NEXT: movaps %xmm0, %xmm2 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm1 @@ -103,7 +103,7 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4 ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 @@ -306,12 +306,12 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; SSE42-NEXT: pand %xmm5, %xmm6 ; SSE42-NEXT: pcmpeqw %xmm5, %xmm6 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm7 = [1,2,4,8] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; 
SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm7, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm0 = [16,32,64,128] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -328,7 +328,7 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/pr63507.ll b/llvm/test/CodeGen/X86/pr63507.ll index 4016f1c3edb67..46f1038db19c6 100644 --- a/llvm/test/CodeGen/X86/pr63507.ll +++ b/llvm/test/CodeGen/X86/pr63507.ll @@ -4,7 +4,7 @@ define <4 x i32> @PR63507() { ; CHECK-LABEL: PR63507: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4294967295,4294967295] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0] ; CHECK-NEXT: vpmulld %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %psll.i = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer) diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index 1c3b4bd4971c1..ceccee00c9457 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -39,7 +39,7 @@ define void @main(<16 x i32> %0, i32 %1) { ; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] +; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] ; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 
= ymm0[2,3],ymm1[0,1] ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll index 9c072e6f5e3fc..96f6a18819383 100644 --- a/llvm/test/CodeGen/X86/pr77459.ll +++ b/llvm/test/CodeGen/X86/pr77459.ll @@ -98,7 +98,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax @@ -157,7 +157,7 @@ define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ; AVX512-NEXT: vpmovm2w %k0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovw2m %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 7f48f93bf7771..ffc83620d3dad 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -49,7 +49,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ 
-65,7 +65,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0 ; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1 -; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX256VLBW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX256VLBW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX256VLBW-NEXT: vpmovw2m %ymm2, %k0 ; AVX256VLBW-NEXT: vpmovm2b %k0, %xmm0 @@ -80,7 +80,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VLBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VLBW-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512VLBW-NEXT: vpmovm2b %k0, %xmm0 @@ -95,7 +95,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -111,7 +111,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 @@ -181,7 +181,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512NOBW-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512NOBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512NOBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512NOBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512NOBW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 4f7f44d1d7c14..ca6d361224a8c 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -793,7 +793,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test13: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -903,7 +903,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm6 ; SSE41-NEXT: packsswb %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm5 = [255,255,255,255] ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: pand %xmm5, %xmm3 ; SSE41-NEXT: packusdw %xmm4, %xmm3 @@ -1047,7 +1047,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test15: ; SSE41: # %bb.0: # 
%vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -1565,7 +1565,7 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_8i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -1871,7 +1871,7 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm6 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm6, %xmm3 ; SSE41-NEXT: pminud %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 @@ -1975,7 +1975,7 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin ; ; SSE41-LABEL: psubus_i16_i32_max_swapped: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -2070,7 +2070,7 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_i16_i32_min: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -2659,7 +2659,7 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) { ; ; SSE41-LABEL: test32: ; 
SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -2993,7 +2993,7 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-LABEL: test34: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index c34ee3a7b786a..f78b57d895ee1 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -693,7 +693,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) { ; ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [42,42] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm2, %xmm0 @@ -757,7 +757,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) { ; ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [42,42] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 483c16d6531b4..2187c653f76c3 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -106,10 +106,10 @@ define void @failing(ptr %0, ptr %1) 
nounwind { ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax ; CHECK-AVX2-NEXT: movq 24(%rsi), %rcx ; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,1] ; CHECK-AVX2-NEXT: xorl %esi, %esi -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,2] +; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1] +; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [2,2] ; CHECK-AVX2-NEXT: .p2align 4, 0x90 ; CHECK-AVX2-NEXT: .LBB0_1: # %vector.ph ; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index de9d47526b166..b464cfa789043 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -228,7 +228,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind { ; AVX-LABEL: cmp_ult_load_const: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,214,0,255] +; AVX-NEXT: vpmovsxwd {{.*#+}} xmm1 = [42,214,0,255] ; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %loadx = load <4 x i8>, ptr %x @@ -292,7 +292,7 @@ define <4 x i32> @cmp_slt_load_const(ptr %x) nounwind { ; AVX-LABEL: cmp_slt_load_const: ; AVX: # %bb.0: ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,4294967254,0,4294967295] +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [42,4294967254,0,4294967295] ; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %loadx = load <4 x i8>, ptr %x diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 489ee1c9c5bea..e94f51233256c 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -65,7 +65,7 @@ 
define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind { ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: ; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] ; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 05dd2344d30f7..2d7e6f6e58bee 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -810,8 +810,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -882,8 +881,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -1307,7 +1305,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1315,8 +1313,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX2-LABEL: negative: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1326,8 +1323,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512F-LABEL: negative: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1346,8 +1342,7 @@ define <16 x 
i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512BW-LABEL: negative: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 85e160e497172..a481aaef4257d 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -34,7 +34,7 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind { ; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512VL-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 913ae195c3631..b8d3527070b87 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -101,7 +101,7 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) { define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; 
SLM-LABEL: test_mul_v8i32_v8i8: ; SLM: # %bb.0: -; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -114,7 +114,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SLOW-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW-NEXT: ret{{[l|q]}} @@ -124,7 +124,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SSE4-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-NEXT: ret{{[l|q]}} @@ -199,7 +199,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM-LABEL: test_mul_v16i32_v16i8: ; SLM: # %bb.0: ; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] +; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 
= [18778,18778,18778,18778] ; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -221,7 +221,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] ; SLOW-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW-NEXT: pmaddwd %xmm4, %xmm2 @@ -237,7 +237,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] ; SSE4-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-NEXT: pmaddwd %xmm4, %xmm2 @@ -399,7 +399,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; SSE4-NEXT: pmulld %xmm1, %xmm2 ; SSE4-NEXT: pmulld %xmm0, %xmm1 ; 
SSE4-NEXT: movdqa %xmm2, %xmm0 @@ -480,7 +480,7 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] ; SSE4-NEXT: pmulld %xmm3, %xmm0 ; SSE4-NEXT: pmulld %xmm3, %xmm4 ; SSE4-NEXT: pmulld %xmm3, %xmm2 @@ -621,7 +621,7 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize { define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { ; SLM-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM: # %bb.0: -; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -634,7 +634,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { ; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SLOW-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW-NEXT: ret{{[l|q]}} @@ -644,7 +644,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) 
minsize { ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] ; SSE4-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-NEXT: ret{{[l|q]}} @@ -719,7 +719,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLM-LABEL: test_mul_v16i32_v16i8_minsize: ; SLM: # %bb.0: ; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] +; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778] ; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -741,7 +741,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] ; SLOW-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW-NEXT: pmaddwd %xmm4, %xmm2 @@ -757,7 +757,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize { ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] ; SSE4-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-NEXT: pmaddwd %xmm4, %xmm2 @@ -861,7 +861,7 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize { define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SLM-LABEL: test_mul_v8i32_v8i16_minsize: ; SLM: # %bb.0: -; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] +; SLM-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM-NEXT: pxor %xmm3, %xmm3 ; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] @@ -875,7 +875,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SLOW-NEXT: pxor %xmm1, %xmm1 ; SLOW-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] +; SLOW-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] ; SLOW-NEXT: pmulld %xmm1, %xmm2 ; SLOW-NEXT: pmulld %xmm0, %xmm1 ; SLOW-NEXT: movdqa %xmm2, %xmm0 @@ -886,7 +886,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = 
[18778,18778,18778,18778] ; SSE4-NEXT: pmulld %xmm1, %xmm2 ; SSE4-NEXT: pmulld %xmm0, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 @@ -906,7 +906,7 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize { define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SLM-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM: # %bb.0: -; SLM-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778] +; SLM-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] ; SLM-NEXT: movdqa %xmm0, %xmm4 ; SLM-NEXT: pxor %xmm5, %xmm5 ; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -928,7 +928,7 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SLOW-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SLOW-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SLOW-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778] +; SLOW-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] ; SLOW-NEXT: pmulld %xmm3, %xmm0 ; SLOW-NEXT: pmulld %xmm3, %xmm4 ; SLOW-NEXT: pmulld %xmm3, %xmm2 @@ -944,7 +944,7 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778] +; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] ; SSE4-NEXT: pmulld %xmm3, %xmm0 ; SSE4-NEXT: pmulld %xmm3, %xmm4 ; SSE4-NEXT: pmulld %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index 
565946d342e93..fdb2f41ec0e49 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -250,7 +250,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: subq %rax, %rdi ; SSE41-NEXT: movq %rdi, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967295,1,4294967295,1] ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE41-NEXT: movq %rcx, %rax diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 56d1fef5f6d6d..9f610d95c7e3b 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1357,7 +1357,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll 
b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index c8c1026bdaf3f..c8de34f63dd85 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -148,7 +148,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; SSE-NEXT: psrlw $15, %xmm2 ; SSE-NEXT: psraw $6, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/sse-domains.ll b/llvm/test/CodeGen/X86/sse-domains.ll index 186c89a75cd3a..116efa69bb319 100644 --- a/llvm/test/CodeGen/X86/sse-domains.ll +++ b/llvm/test/CodeGen/X86/sse-domains.ll @@ -17,7 +17,7 @@ define void @f(ptr nocapture %p, i32 %n) nounwind uwtable ssp { ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: addq $16, %rdi ; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [127,127,127,127] +; CHECK-NEXT: pmovsxbd {{.*#+}} xmm0 = [127,127,127,127] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_1: ## %while.body ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index 3aec8330596e1..056c6404f5cfa 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -806,7 +806,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; X86-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] +; X86-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4] ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6 @@ -827,7 +827,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x 
i64> %a, <8 x i64> ; ; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] +; X86-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4] ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 @@ -856,7 +856,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; X64-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4] +; X64-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4] ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6 @@ -877,7 +877,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; ; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4] ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index f677e6737987e..b4e91da920a2f 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -168,7 +168,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE41-NEXT: pinsrd $2, %edx, %xmm0 ; SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm1 = [2047,2047,2047,2047] ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrld $1, %xmm2 diff --git 
a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 484ae2bf95602..f49b7ae1b229c 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -483,7 +483,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -915,7 +915,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,2,u] +; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1145,7 +1145,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1797,7 +1797,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: 
pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll index 12c1fe9187226..2166e43fc4286 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -564,7 +564,7 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll index 5243c6dd4e600..6d99bedd40b91 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -155,7 +155,7 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-SSE41-LABEL: t2_narrow: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [21845,65535,65535,65535,21845,65535,65535,65535] +; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744073709507925,18446744073709507925] ; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll index 43f73a19d635e..3873f04b8307e 100644 --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ 
b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -115,7 +115,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] ; SSE-NEXT: pmulhuw %xmm0, %xmm1 ; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -139,7 +139,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63] +; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax ; SSE-NEXT: andl $31, %eax diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 8823b98c4ff8a..73e90fe77bca2 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -1093,22 +1093,56 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { } define void @PR48223(ptr %p0) { -; SSE-LABEL: PR48223: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; SSE-NEXT: psubusw %xmm4, %xmm1 -; SSE-NEXT: psubusw %xmm4, %xmm0 -; SSE-NEXT: psubusw %xmm4, %xmm3 -; SSE-NEXT: psubusw %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rdi) -; SSE-NEXT: movdqa %xmm3, 48(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: PR48223: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = 
[64,64,64,64,64,64,64,64] +; SSE2-NEXT: psubusw %xmm4, %xmm1 +; SSE2-NEXT: psubusw %xmm4, %xmm0 +; SSE2-NEXT: psubusw %xmm4, %xmm3 +; SSE2-NEXT: psubusw %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) +; SSE2-NEXT: movdqa %xmm3, 48(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm1, 16(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR48223: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm1 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] +; SSSE3-NEXT: psubusw %xmm4, %xmm1 +; SSSE3-NEXT: psubusw %xmm4, %xmm0 +; SSSE3-NEXT: psubusw %xmm4, %xmm3 +; SSSE3-NEXT: psubusw %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR48223: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] +; SSE41-NEXT: psubusw %xmm4, %xmm1 +; SSE41-NEXT: psubusw %xmm4, %xmm0 +; SSE41-NEXT: psubusw %xmm4, %xmm3 +; SSE41-NEXT: psubusw %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, 32(%rdi) +; SSE41-NEXT: movdqa %xmm3, 48(%rdi) +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm1, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX1-LABEL: PR48223: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 6c07c4ca523f8..fef5d22251172 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -1215,8 +1215,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vpcmpgtq %zmm0, 
%zmm2, %k1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [17,51,85,119] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,51,85,119] +; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm0 = [34,68,102,136] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: movq %rbp, %rsp @@ -1238,7 +1238,7 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr ; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136] +; AVX512VL-NEXT: vpmovsxwq {{.*#+}} ymm0 = [34,68,102,136] ; AVX512VL-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} ; AVX512VL-NEXT: movq %rbp, %rsp ; AVX512VL-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 7579c89ec4389..7bbcdee9a6802 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -1907,7 +1907,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; ; SSE41-LABEL: uitofp_2i64_to_4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 @@ -2022,7 +2022,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; ; SSE41-LABEL: uitofp_2i64_to_2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 @@ -2136,7 +2136,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; ; SSE41-LABEL: uitofp_4i64_to_4f32_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand 
%xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 @@ -2531,7 +2531,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: psrlq $1, %xmm3 @@ -4329,7 +4329,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm1 ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm3 @@ -4735,7 +4735,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE41-NEXT: movdqa 16(%rdi), %xmm5 ; SSE41-NEXT: movdqa 32(%rdi), %xmm6 ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm7 = [1,1] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 @@ -5625,7 +5625,7 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; ; SSE41-LABEL: PR43609: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2,2] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [2,2] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll index 531efa2476f68..1fc4c943abc65 100644 --- a/llvm/test/CodeGen/X86/vec_setcc-2.ll +++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll @@ -32,7 +32,7 @@ define void @loop_no_const_reload(ptr %in, ptr %out, i32 %n) { ; SSE41-NEXT: je LBB0_3 ; SSE41-NEXT: ## %bb.1: ## %for.body.preheader ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = 
[25,25,25,25,25,25,25,25] ; SSE41-NEXT: .p2align 4, 0x90 ; SSE41-NEXT: LBB0_2: ## %for.body ; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -100,7 +100,7 @@ define void @loop_const_folding_underflow(ptr %in, ptr %out, i32 %n) { ; SSE41-NEXT: je LBB1_3 ; SSE41-NEXT: ## %bb.1: ## %for.body.preheader ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26] ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: .p2align 4, 0x90 ; SSE41-NEXT: LBB1_2: ## %for.body @@ -219,7 +219,7 @@ define <4 x i1> @ugt_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: ugt_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -291,7 +291,7 @@ define <4 x i1> @uge_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: uge_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -364,7 +364,7 @@ define <4 x i1> @ult_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: ult_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -434,7 +434,7 @@ define <4 x i1> @ule_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: ule_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] ; 
SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -501,7 +501,7 @@ define <4 x i1> @ugt_v4i32_nonsplat(<4 x i32> %x) { ; ; SSE41-LABEL: ugt_v4i32_nonsplat: ; SSE41: ## %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967255,4294967256,4294967257] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967255,4294967256,4294967257] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -520,7 +520,7 @@ define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) { ; ; SSE41-LABEL: ugt_v4i32_splat_commute: ; SSE41: ## %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3,3,3,3] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [3,3,3,3] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -544,7 +544,7 @@ define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) { ; SSE41-LABEL: PR39859: ; SSE41: ## %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43] ; SSE41-NEXT: pmaxuw %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll index 87e29261eaa49..657b71df9fae0 100644 --- a/llvm/test/CodeGen/X86/vec_setcc.ll +++ b/llvm/test/CodeGen/X86/vec_setcc.ll @@ -163,16 +163,27 @@ define <16 x i8> @or_icmp_eq_const_1bit_diff(<16 x i8> %x) { } define <4 x i32> @or_icmp_ne_const_1bit_diff(<4 x i32> %x) { -; SSE-LABEL: or_icmp_ne_const_1bit_diff: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44,60,44,60] -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: or_icmp_ne_const_1bit_diff: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[44,60,44,60] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: or_icmp_ne_const_1bit_diff: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [44,60,44,60] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: or_icmp_ne_const_1bit_diff: ; AVX: # %bb.0: @@ -215,16 +226,27 @@ define <16 x i8> @and_icmp_eq_const_1bit_diff(<16 x i8> %x) { } define <4 x i32> @and_icmp_ne_const_1bit_diff(<4 x i32> %x) { -; SSE-LABEL: and_icmp_ne_const_1bit_diff: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44,60,54,44] -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: and_icmp_ne_const_1bit_diff: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44,60,54,44] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: and_icmp_ne_const_1bit_diff: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [44,60,54,44] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: and_icmp_ne_const_1bit_diff: ; AVX: # %bb.0: diff --git 
a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll index ce17e155c808e..a905f881742bd 100644 --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -133,7 +133,7 @@ define <8 x i32> @test6(<8 x i32> %a) { ; ; SSE41-LABEL: test6: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [2,2,4,8] ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pmulld %xmm2, %xmm1 ; SSE41-NEXT: retq @@ -218,7 +218,7 @@ define <16 x i32> @test8(<16 x i32> %a) { ; ; SSE41-LABEL: test8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [2,2,4,8] ; SSE41-NEXT: pmulld %xmm4, %xmm0 ; SSE41-NEXT: pmulld %xmm4, %xmm1 ; SSE41-NEXT: pmulld %xmm4, %xmm2 @@ -274,7 +274,7 @@ define <8 x i64> @test9(<8 x i64> %a) { ; ; AVX2-LABEL: test9: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3] ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index b275814cc8033..f54f94272f059 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -270,7 +270,7 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 @@ -401,7 +401,7 @@ define <4 x i32> @smulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: 
vpmuldq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 @@ -672,7 +672,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0 @@ -871,7 +871,7 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0 @@ -1229,7 +1229,7 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpmuldq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0 @@ -3191,7 +3191,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) 
nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index e929499c92cbd..bafe025237f65 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -226,7 +226,7 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 @@ -342,7 +342,7 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 @@ -562,7 +562,7 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; 
AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 @@ -732,7 +732,7 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 @@ -1024,7 +1024,7 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 @@ -2820,7 +2820,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 6d71564dd57f9..72fd9beab81ab 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -1306,14 +1306,14 @@ define <8 x float> 
@fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -1327,7 +1327,7 @@ define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] @@ -1407,26 +1407,26 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; 
SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -1442,12 +1442,12 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 @@ -1467,28 +1467,45 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n } define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) { -; SSE-LABEL: fsub_v8f32_cast_cond: -; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: subps %xmm4, %xmm0 -; SSE-NEXT: subps 
%xmm6, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: fsub_v8f32_cast_cond: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: subps %xmm4, %xmm0 +; SSE2-NEXT: subps %xmm6, %xmm1 +; SSE2-NEXT: retq +; +; SSE42-LABEL: fsub_v8f32_cast_cond: +; SSE42: # %bb.0: +; SSE42-NEXT: movd %edi, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm5, %xmm6 +; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE42-NEXT: pand %xmm3, %xmm6 +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm4 +; SSE42-NEXT: subps %xmm4, %xmm0 +; SSE42-NEXT: subps %xmm6, %xmm1 +; SSE42-NEXT: retq ; ; AVX2-LABEL: fsub_v8f32_cast_cond: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -1556,22 +1573,22 @@ define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} 
xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -1585,11 +1602,11 @@ define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -1636,14 +1653,14 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; 
SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -1657,7 +1674,7 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] @@ -1737,26 +1754,26 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -1772,12 +1789,12 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb 
%xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 @@ -1824,14 +1841,14 @@ define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -1845,7 +1862,7 @@ define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] @@ -1925,26 +1942,26 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; 
SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm11, %xmm10 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -1960,12 +1977,12 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 @@ -2089,28 +2106,45 @@ define <8 x i32> @add_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> } define <8 x i32> @add_v8i32_cast_cond(i8 
noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { -; SSE-LABEL: add_v8i32_cast_cond: -; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: paddd %xmm4, %xmm0 -; SSE-NEXT: paddd %xmm6, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: add_v8i32_cast_cond: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: retq +; +; SSE42-LABEL: add_v8i32_cast_cond: +; SSE42: # %bb.0: +; SSE42-NEXT: movd %edi, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm5, %xmm6 +; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE42-NEXT: pand %xmm3, %xmm6 +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm0 +; SSE42-NEXT: paddd %xmm6, %xmm1 +; SSE42-NEXT: retq ; ; AVX2-LABEL: add_v8i32_cast_cond: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand 
%ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -2178,22 +2212,22 @@ define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -2207,11 +2241,11 @@ define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -2525,28 +2559,45 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, } define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> 
noundef %x, <8 x i32> noundef %y) { -; SSE-LABEL: sub_v8i32_cast_cond: -; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: psubd %xmm4, %xmm0 -; SSE-NEXT: psubd %xmm6, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: sub_v8i32_cast_cond: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: psubd %xmm4, %xmm0 +; SSE2-NEXT: psubd %xmm6, %xmm1 +; SSE2-NEXT: retq +; +; SSE42-LABEL: sub_v8i32_cast_cond: +; SSE42: # %bb.0: +; SSE42-NEXT: movd %edi, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: movdqa %xmm4, %xmm6 +; SSE42-NEXT: pand %xmm5, %xmm6 +; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE42-NEXT: pand %xmm3, %xmm6 +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE42-NEXT: pand %xmm2, %xmm4 +; SSE42-NEXT: psubd %xmm4, %xmm0 +; SSE42-NEXT: psubd %xmm6, %xmm1 +; SSE42-NEXT: retq ; ; AVX2-LABEL: sub_v8i32_cast_cond: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: 
vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -2614,22 +2665,22 @@ define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -2643,11 +2694,11 @@ define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -2848,14 +2899,14 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = 
xmm0[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -2869,7 +2920,7 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] @@ -2985,26 +3036,26 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 -; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq 
%xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -3056,12 +3107,12 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] ; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 @@ -3564,12 +3615,12 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm5 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm4, %xmm6 ; SSE42-NEXT: pand %xmm5, %xmm6 ; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE42-NEXT: pand %xmm3, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] ; SSE42-NEXT: pand %xmm3, %xmm4 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE42-NEXT: pand %xmm2, %xmm4 @@ -3588,7 +3639,7 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: 
vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -3672,22 +3723,22 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 ; SSE42-NEXT: pand %xmm5, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -3717,11 +3768,11 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -4394,12 +4445,12 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; 
SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm4 ; SSE42-NEXT: pand %xmm6, %xmm4 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] ; SSE42-NEXT: pand %xmm3, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE42-NEXT: pand %xmm2, %xmm5 @@ -4439,7 +4490,7 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -4523,22 +4574,22 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 ; SSE42-NEXT: pand %xmm5, %xmm10 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -4568,11 +4619,11 @@ define <8 x i64> 
@lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -5245,12 +5296,12 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: pmovsxwd {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm4 ; SSE42-NEXT: pand %xmm6, %xmm4 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] ; SSE42-NEXT: pand %xmm3, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE42-NEXT: pand %xmm2, %xmm5 @@ -5290,7 +5341,7 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -5402,22 +5453,22 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: pmovsxwq {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa 
%xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [4,8] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm6 ; SSE42-NEXT: pand %xmm10, %xmm6 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm6 ; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -5475,11 +5526,11 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 550b2e0655438..1addedf3c3d96 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -49,7 +49,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: psrlq $1, %xmm1 @@ -69,8 +69,7 @@ define 
<2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 @@ -87,7 +86,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -98,7 +97,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -120,7 +119,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -157,8 +156,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: # xmm3 = mem[0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -171,7 +169,7 @@ define <2 x i64> 
@var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -241,7 +239,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -970,44 +968,44 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: psrlq $1, %xmm1 -; SSE-NEXT: psrlq %xmm4, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: psllq %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: psrlq %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllq %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX1-LABEL: splatvar_funnnel_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, 
%xmm0 -; AVX1-NEXT: retq +; SSE41-LABEL: splatvar_funnnel_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pandn %xmm3, %xmm4 +; SSE41-NEXT: psrlq $1, %xmm1 +; SSE41-NEXT: psrlq %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: psllq %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq ; -; AVX2-LABEL: splatvar_funnnel_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] -; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: splatvar_funnnel_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1029,7 +1027,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1065,28 +1063,16 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; 
XOPAVX1-LABEL: splatvar_funnnel_v2i64: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: # xmm3 = mem[0,0] -; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_funnnel_v2i64: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] -; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_funnnel_v2i64: +; XOP: # %bb.0: +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1 +; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: @@ -1210,21 +1196,33 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % } define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: psrlw %xmm4, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: psllw %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: 
psrlw %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psllw %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pandn %xmm3, %xmm4 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: psrlw %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: psllw %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1235,7 +1233,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1246,7 +1244,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1257,7 +1255,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1278,7 +1276,7 @@ define <8 x i16> 
@splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1295,7 +1293,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1762,7 +1760,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14] ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1893,7 +1891,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1985,9 +1983,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} 
xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -1998,7 +1996,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2085,7 +2083,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 @@ -2103,7 +2101,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 683fdf15cdea4..41209111dc47b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -760,8 +760,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 @@ -779,7 +778,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -790,7 +789,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -812,7 +811,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x 
i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -849,8 +848,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: # xmm3 = mem[0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 @@ -868,7 +866,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1000,7 +998,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1088,7 +1086,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; 
XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1278,7 +1276,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX1-NEXT: vmovd %ecx, %xmm2 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [31,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [31,0] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 @@ -1614,7 +1612,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,14,50,60] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,14,50,60] ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1712,7 +1710,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1802,9 +1800,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1814,7 +1812,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 558e77ede0186..e23855361e57a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -426,7 +426,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -448,7 +448,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll 
index 6a8d9d73f138b..b2b93ae43f365 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -48,7 +48,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm1 @@ -69,8 +69,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] @@ -88,7 +87,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -725,43 +724,43 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psubq %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psllq %xmm1, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: psrlq %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, 
%xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllq %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: psrlq %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: retq ; -; AVX1-LABEL: splatvar_funnnel_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: retq +; SSE41-LABEL: splatvar_funnnel_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: psubq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psllq %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: psrlq %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: retq ; -; AVX2-LABEL: splatvar_funnnel_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: splatvar_funnnel_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: @@ -945,7 +944,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; 
SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -958,7 +957,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -969,7 +968,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -980,7 +979,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -991,7 +990,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1002,7 +1001,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovd {{.*#+}} 
xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1238,7 +1237,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1252,7 +1251,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1266,7 +1265,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1348,7 +1347,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ 
-1362,7 +1361,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1376,7 +1375,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1446,9 +1445,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1464,7 +1463,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1540,11 +1539,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1565,11 +1564,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; ; AVX512VBMI2-LABEL: constant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 6fc95cc7780ff..60e281a82d5cd 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -570,8 +570,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { define <4 x i64> 
@splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 @@ -588,7 +587,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -757,7 +756,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 @@ -1028,7 +1027,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1041,7 +1040,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, 
%zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1054,7 +1053,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1116,7 +1115,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1129,7 +1128,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1142,7 +1141,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1213,9 +1212,9 @@ define <16 
x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1230,7 +1229,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 3452b33ada2a9..8523cb4973827 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; 
AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index 64deaf0e75966..d8e45ed9151d8 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -59,7 +59,7 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -390,7 +390,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git 
a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 70e3a02516743..b78bcd40708e5 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -49,7 +49,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 @@ -69,8 +69,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] @@ -87,7 +86,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -98,7 +97,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -120,7 +119,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = 
[63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -158,8 +157,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: # xmm3 = mem[0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4 @@ -172,7 +170,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -242,7 +240,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -486,7 +484,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE41-LABEL: var_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -1091,44 +1089,44 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v2i64: -; 
SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: psrlq %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm0 -; SSE-NEXT: psllq %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psrlq %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: psllq %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX1-LABEL: splatvar_funnnel_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq +; SSE41-LABEL: splatvar_funnnel_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: psrlq %xmm4, %xmm1 +; SSE41-NEXT: pandn %xmm3, %xmm2 +; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: psllq %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq ; -; AVX2-LABEL: splatvar_funnnel_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] -; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: splatvar_funnnel_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpandn %xmm3, %xmm2, 
%xmm2 +; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1150,7 +1148,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1187,28 +1185,16 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOPAVX1-LABEL: splatvar_funnnel_v2i64: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: # xmm3 = mem[0,0] -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_funnnel_v2i64: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] -; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_funnnel_v2i64: +; XOP: # %bb.0: +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 +; 
XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: @@ -1335,21 +1321,33 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % } define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { -; SSE-LABEL: splatvar_funnnel_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: psrlw %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: psllw %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psrlw %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: paddw %xmm0, %xmm0 +; SSE2-NEXT: psllw %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_funnnel_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: psrlw %xmm4, %xmm1 +; SSE41-NEXT: pandn %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: psllw %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1360,7 +1358,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} 
xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1371,7 +1369,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1382,7 +1380,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1403,7 +1401,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1421,7 +1419,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1665,7 +1663,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def 
$xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14] ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1797,7 +1795,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1906,9 +1904,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -1919,7 +1917,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2073,7 +2071,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; ; 
AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 @@ -2090,7 +2088,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 61aea6ad4d595..c6f97f65a6241 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -790,8 +790,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; AVX1-NEXT: # xmm3 = mem[0,0] +; 
AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 @@ -809,7 +808,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -820,7 +819,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -842,7 +841,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -880,8 +879,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: # xmm3 = mem[0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 @@ -899,7 +897,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; 
XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1032,7 +1030,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 @@ -1121,7 +1119,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 @@ -1364,7 +1362,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,14,50,60] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,14,50,60] ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1463,7 +1461,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %ymm1, 
%ymm0 ; AVX512VBMI2-NEXT: retq @@ -1554,9 +1552,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1566,7 +1564,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 29c258b857eec..91dd83050e17e 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -424,7 +424,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: 
vpandn %xmm3, %xmm2, %xmm2 @@ -446,7 +446,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 3fa9994312e45..dbc33f8ad1490 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -48,7 +48,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm1 @@ -69,8 +69,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] @@ -88,7 +87,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -752,43 +751,43 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { -; SSE-LABEL: 
splatvar_funnnel_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psubq %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psrlq %xmm1, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: psllq %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_funnnel_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlq %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: psllq %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: retq ; -; AVX1-LABEL: splatvar_funnnel_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: retq +; SSE41-LABEL: splatvar_funnnel_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: psubq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrlq %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: psllq %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: retq ; -; AVX2-LABEL: splatvar_funnnel_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: 
splatvar_funnnel_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: @@ -982,7 +981,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -995,7 +994,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1006,7 +1005,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1017,7 +1016,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1028,7 +1027,7 @@ define <8 x i16> 
@splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1039,7 +1038,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1302,7 +1301,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1316,7 +1315,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1330,7 +1329,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512VBMI2-NEXT: 
vprorvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1412,7 +1411,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1426,7 +1425,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1440,7 +1439,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1510,9 +1509,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9] ; 
AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1528,7 +1527,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1604,11 +1603,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1629,11 +1628,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; ; AVX512VBMI2-LABEL: constant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 
= [0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index b2047a04f163e..6f8c893ac4b38 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -601,8 +601,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlq %xmm3, %xmm4, %xmm5 @@ -619,7 +618,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -796,7 +795,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5 @@ -1079,7 +1078,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa 
{{.*#+}} ymm1 = [4,14,50,60] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1092,7 +1091,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1105,7 +1104,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1167,7 +1166,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1180,7 +1179,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def 
$ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1193,7 +1192,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1264,9 +1263,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1281,7 +1280,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index d78aa4e049e0a..d4874ad2cbd78 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ 
b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 1add344e3e41f..0426c48aecfcf 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -59,7 +59,7 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: pshuflw 
{{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -454,7 +454,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] +; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 34c584e8eb7ad..52185e73b56dd 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -334,7 +334,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_divconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -810,7 +810,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 1b55a401f401d..7477029e2d7a5 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -88,7 +88,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) 
nounwind { ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ; AVX-NEXT: vpsrld $31, %zmm0, %zmm1 @@ -394,7 +394,7 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm1 ; AVX-NEXT: vpsrld $31, %zmm1, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 1ce21cb39b2e8..2477fb704db63 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -190,7 +190,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] ; SSE41-NEXT: pmullw %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -400,7 +400,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; ; 
AVX512BW-LABEL: test_divconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -413,7 +413,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -665,7 +665,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] ; SSE41-NEXT: pmullw %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -916,7 +916,7 @@ define <16 
x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -929,7 +929,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index 3b037829c54a0..b21c50d91447b 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -89,7 +89,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm0 ; AVX-NEXT: vpsrld $1, %zmm0, %zmm0 @@ -414,7 +414,7 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm1 ; AVX-NEXT: vpsrld $1, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index 9e9325558804e..3bc97f71f04fb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -501,7 +501,7 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [84148480,218892552,353636624,488380696] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm3 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] @@ -519,7 +519,7 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [84148480,218892552,353636624,488380696] ; AVX2-FCP-NEXT: 
vpshufb %ymm2, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] @@ -759,7 +759,7 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm4 = [84148480,218892552,353636624,488380696] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] @@ -791,7 +791,7 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm4 = [84148480,218892552,353636624,488380696] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] @@ -873,9 +873,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -886,9 +886,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -899,9 +899,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -912,9 +912,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -1245,7 +1245,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [84148480,218892552,353636624,488380696] ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm8, %ymm2 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] @@ -1301,7 +1301,7 @@ define 
void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [84148480,218892552,353636624,488380696] ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] @@ -1441,11 +1441,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1461,11 +1461,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1481,11 +1481,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1501,11 +1501,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) 
no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 4af7979f12f90..1436922f9dd11 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -630,13 +630,13 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride3_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = 
[2,5,8,11,14,17,20,23] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -646,13 +646,13 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -662,13 +662,13 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; 
AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -678,13 +678,13 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -855,7 +855,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] @@ -868,7 +868,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] @@ -879,7 +879,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] @@ -899,7 +899,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] @@ -912,7 +912,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] @@ -923,7 +923,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] @@ -943,7 +943,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: 
vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] @@ -956,7 +956,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] @@ -967,7 +967,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] @@ -1163,11 +1163,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # 
%bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1179,11 +1179,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1195,11 +1195,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; 
AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1211,11 +1211,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1580,7 +1580,7 @@ define void @load_i16_stride3_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] @@ -1607,7 +1607,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] @@ -1630,7 +1630,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; 
AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] @@ -1664,7 +1664,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] @@ -1691,7 +1691,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] @@ -1714,7 +1714,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] 
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] @@ -1748,7 +1748,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] @@ -1775,7 +1775,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 ; AVX2-FCP-NEXT: 
vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] @@ -1798,7 +1798,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] @@ -2151,17 +2151,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2174,17 +2174,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2197,17 +2197,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2220,17 +2220,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), 
%zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w 
%zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2989,17 +2989,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX2-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 ; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 @@ -3162,17 +3162,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa 
128(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 @@ -3335,17 +3335,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; 
AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 @@ -4193,23 +4193,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 @@ -4230,23 +4230,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 @@ -4267,23 +4267,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: 
vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 @@ -4304,23 +4304,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 084ec8e1d8d9e..3f77e50260c8d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -1233,7 +1233,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; 
AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 @@ -1261,7 +1261,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 @@ -1369,7 +1369,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4 @@ -1377,7 +1377,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7 ; AVX512-FCP-NEXT: vmovdqa64 
(%rdi), %zmm4 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm9 @@ -1390,7 +1390,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 @@ -1484,7 +1484,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4 @@ -1492,7 +1492,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm9 @@ -1505,7 +1505,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] 
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 @@ -1529,15 +1529,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-LABEL: load_i16_stride4_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1548,15 +1548,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} 
ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1567,15 +1567,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-BW-LABEL: load_i16_stride4_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: 
vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1586,15 +1586,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -2578,7 +2578,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm10 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; 
AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3 @@ -2642,7 +2642,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] @@ -2857,7 +2857,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm7 @@ -2865,7 +2865,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] ; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] @@ -2894,7 +2894,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-FCP-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm12, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm13 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm8 @@ -3069,7 +3069,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm7 @@ -3077,7 +3077,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] @@ -3106,7 +3106,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm12, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm13 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm8 @@ -5365,7 +5365,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] @@ -5506,7 +5506,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload @@ -5982,7 +5982,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: 
vmovdqa64 224(%rdi), %ymm24 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm1, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 @@ -5990,7 +5990,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 ; AVX512-FCP-NEXT: vpermd %ymm25, %ymm1, %ymm11 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] @@ -6051,7 +6051,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm15, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpermd %ymm25, %ymm15, %ymm8 @@ -6406,7 +6406,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm1, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 @@ -6414,7 +6414,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 ; 
AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm1, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] @@ -6475,7 +6475,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm15, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm15, %ymm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 94fa5c12e1c42..8e55cb48cf7a2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1188,15 +1188,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; 
AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1210,15 +1210,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1232,15 +1232,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa 
{{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1254,15 +1254,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1640,7 +1640,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm6 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4 @@ -1729,7 +1729,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5 @@ -1814,13 +1814,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; 
AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 @@ -1831,12 +1830,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,u,u,u,4,7,1,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,4,7,1,6] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 @@ -1847,7 +1845,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] @@ -1862,7 +1860,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] @@ -1877,12 +1875,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw 
{{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,u,u,6,0,3,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,2,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -1989,7 +1986,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,u,u,u,4,6,1,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] @@ -1997,8 +1994,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] @@ -2006,7 +2002,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,u,u,4,7,1,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] @@ -2014,8 +2010,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] 
; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 @@ -2026,7 +2021,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] @@ -2041,7 +2036,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] @@ -2056,15 +2051,14 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,u,u,6,0,3,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx) @@ -2169,7 +2163,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,u,u,u,4,6,1,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] @@ -2177,8 +2171,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] @@ -2186,7 +2179,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,u,u,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] @@ -2194,8 +2187,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 @@ -2206,7 +2198,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] @@ -2221,7 +2213,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 
= xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] @@ -2236,15 +2228,14 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,u,u,6,0,3,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx) @@ -2258,26 +2249,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: 
vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) @@ -2291,26 +2282,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} 
ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) @@ -2324,26 +2315,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) @@ -2357,26 +2348,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, 
%ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) @@ -3246,7 +3237,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15] ; AVX2-NEXT: vmovdqa %ymm12, %ymm15 @@ -3469,7 +3460,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] @@ -3679,7 +3670,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 @@ -3690,7 +3681,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm13 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] ; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm0 @@ -3705,7 +3696,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,u,u,4,7,1,6] 
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,4,7,1,6] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 @@ -3725,7 +3716,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm9 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] @@ -3739,7 +3730,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm11 @@ -3760,7 +3751,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,2,u,u,5,7,2,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,2,0,0,5,7,2,4] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 @@ -3795,7 +3786,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 @@ -3831,7 +3822,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,u,u,6,0,3,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] @@ -3839,7 +3830,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} 
xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 @@ -4082,10 +4073,10 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,4,7,1,4,6,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,3,2,4,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 @@ -4100,7 +4091,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,u,u,u,4,6,1,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] @@ -4108,7 +4099,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,1,u,0,3,5,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] @@ -4122,11 +4113,11 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] ; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,2,u,1,3,6,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 @@ -4135,7 +4126,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,5,7,4,7,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] @@ -4155,7 +4146,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,3,5,2,5,7,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] @@ -4166,7 +4157,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,2,u,u,5,7,2,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] ; AVX512-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] @@ -4181,14 +4172,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,u,u,5,0,2,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 ; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] @@ -4213,19 +4204,19 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,u,u,6,0,3,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,4,6,3,6,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 @@ -4454,10 +4445,10 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,4,7,1,4,6,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,3,2,4,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 @@ -4472,7 +4463,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,u,u,u,4,6,1,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] @@ -4480,7 +4471,7 @@ define void 
@load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,1,u,0,3,5,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] @@ -4494,11 +4485,11 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,2,u,1,3,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 @@ -4507,7 +4498,7 @@ 
define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,5,7,4,7,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] @@ -4527,7 +4518,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,3,5,2,5,7,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] @@ -4538,7 +4529,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,2,u,u,5,7,2,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] ; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] @@ -4553,14 +4544,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,u,u,5,0,2,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] @@ -4585,19 +4576,19 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,u,u,6,0,3,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,4,6,3,6,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
%ymm2, %zmm3, %zmm2 @@ -4631,40 +4622,40 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512BW-NEXT: # zmm7 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -4672,7 +4663,7 @@ define void 
@load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -4692,40 +4683,40 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 
= [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -4733,7 +4724,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -4753,40 +4744,40 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, 
%k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] 
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -4794,7 +4785,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -4814,40 +4805,40 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, 
%zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -4855,7 +4846,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -6580,7 +6571,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] @@ -6634,7 +6625,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload @@ -7068,7 +7059,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] @@ -7550,11 +7541,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm13 = 
[65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] @@ -7614,7 +7605,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,u,u,u,4,7,1,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,0,0,0,4,7,1,6] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 @@ -7647,7 +7638,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm11 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] @@ -7682,7 +7673,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm7, %ymm11 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7720,7 +7711,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,u,u,5,7,2,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,0,0,5,7,2,4] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 @@ -7793,7 +7784,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,u,u,5,0,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,5,0,2,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6 @@ -7861,13 +7852,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm2 = 
ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,u,u,6,0,3,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,0,0,6,0,3,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -8464,7 +8455,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 @@ -8473,17 +8464,17 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 
= ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,4,7,1,4,6,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 @@ -8543,7 +8534,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 @@ -8556,7 +8547,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,2,5,7,4,7,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] ; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm5 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 @@ -8574,7 +8565,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 @@ -8627,7 +8618,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,3,5,2,5,7,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 ; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 @@ -8645,7 +8636,7 @@ define void 
@load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,2,u,u,5,7,2,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] @@ -8704,7 +8695,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] @@ -8737,13 +8728,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 ; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -8755,7 +8746,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,u,u,5,0,2,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] @@ -8765,7 +8756,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq 
$184, %zmm3, %zmm5, %zmm25 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -8819,7 +8810,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,u,u,6,0,3,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 @@ -9396,7 +9387,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 @@ -9405,17 +9396,17 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,4,7,1,4,6,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 @@ -9475,7 +9466,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 @@ -9488,7 +9479,7 @@ define void 
@load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,2,5,7,4,7,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 @@ -9506,7 +9497,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 @@ -9559,7 +9550,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,3,5,2,5,7,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %ymm5, %ymm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 @@ -9577,7 +9568,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,2,u,u,5,7,2,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] @@ -9636,7 +9627,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] @@ -9669,13 +9660,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] 
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -9687,7 +9678,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,u,u,5,0,2,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] @@ -9697,7 +9688,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; 
AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -9751,7 +9742,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,u,u,6,0,3,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 @@ -9832,13 +9823,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -9848,11 +9839,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -9862,11 +9853,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -9876,19 +9867,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -9896,7 +9887,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 
%zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 @@ -9931,13 +9922,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -9947,11 +9938,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -9961,11 +9952,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -9975,19 +9966,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = 
[35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -9995,7 +9986,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 @@ -10030,13 +10021,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -10046,11 +10037,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, 
%zmm3, %zmm12 @@ -10060,11 +10051,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -10074,19 +10065,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512DQ-BW-NEXT: 
vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -10094,7 +10085,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 @@ -10129,13 +10120,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 
%zmm13, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -10145,11 +10136,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -10159,11 +10150,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -10173,19 +10164,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -10193,7 +10184,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index b9b3075a6d98d..6c978da50d53c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -580,13 +580,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] ; AVX512-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -612,13 +612,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] ; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -645,13 +645,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 ; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] ; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -677,13 +677,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -1441,17 +1441,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512BW-NEXT: vpermi2w 
%zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1467,17 +1467,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512BW-FCP-NEXT: vpermi2w 
%zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1493,17 +1493,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1519,17 +1519,17 @@ define void 
@load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -2126,7 +2126,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] @@ -2237,7 +2237,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] @@ -2345,7 +2345,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] @@ -2821,41 +2821,41 
@@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -2877,41 +2877,41 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -2933,41 +2933,41 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -2989,41 +2989,41 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendw 
{{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -4126,7 +4126,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm9, %ymm6, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4416,7 +4416,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4698,7 +4698,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5106,7 +5106,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, 
%ymm0, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] @@ -5388,7 +5388,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] @@ -5669,7 +5669,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] @@ -5945,7 +5945,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] @@ -6075,7 +6075,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; 
AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -6089,7 +6089,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} @@ -6099,7 +6099,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -6113,11 +6113,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, 
%zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6129,7 +6129,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -6163,7 +6163,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; 
AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -6177,7 +6177,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} @@ -6187,7 +6187,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -6201,11 +6201,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm11 
{%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6217,7 +6217,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -6251,7 +6251,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -6265,7 +6265,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} @@ -6275,7 +6275,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -6289,11 +6289,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = 
[4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6305,7 +6305,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -6339,7 +6339,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -6353,7 +6353,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} @@ -6363,7 +6363,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -6377,11 +6377,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6393,7 +6393,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -8635,7 +8635,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 @@ -9246,7 +9246,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = 
[65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6 @@ -9836,7 +9836,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm6 @@ -10660,7 +10660,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10998,7 +10998,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 
%zmm6, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 ; AVX512-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 ; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 @@ -11289,7 +11289,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11620,7 +11620,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; 
AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm4 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm7 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 @@ -11917,7 +11917,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12226,7 +12226,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 ; AVX512DQ-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 ; 
AVX512DQ-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 @@ -12521,7 +12521,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12822,7 +12822,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 @@ -12864,7 +12864,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; 
AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 @@ -12886,7 +12886,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} @@ -12904,7 +12904,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 @@ -12926,7 +12926,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512BW-NEXT: 
vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} @@ -12936,7 +12936,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] @@ -12956,7 +12956,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] @@ -13009,7 +13009,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 @@ -13031,7 +13031,7 @@ 
define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} @@ -13049,7 +13049,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 @@ -13071,7 +13071,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} @@ -13081,7 +13081,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, 
%zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] @@ -13101,7 +13101,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] @@ -13154,7 +13154,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 @@ -13176,7 +13176,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = 
[1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} @@ -13194,7 +13194,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 @@ -13216,7 +13216,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} @@ -13226,7 +13226,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 ; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] @@ -13246,7 +13246,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] @@ -13299,7 +13299,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 @@ -13321,7 +13321,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} @@ -13339,7 +13339,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 @@ -13361,7 +13361,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} @@ -13371,7 +13371,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] @@ 
-13391,7 +13391,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 85dd0dcd0d4da..ea3bf7b9b7203 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -1881,19 +1881,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: 
vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1911,19 +1911,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, 
%zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1941,19 +1941,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1971,19 +1971,19 @@ define void 
@load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -2733,7 +2733,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-NEXT: vpshufb {{.*#+}} 
ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2907,7 +2907,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3072,7 +3072,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm10[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3081,7 +3081,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,5,1,u,4,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,5,1,0,4,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vmovdqa %xmm12, %xmm11 @@ -3101,7 +3101,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,6,1,u,5,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,6,1,0,5,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 @@ -3141,7 +3141,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u] +; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,2,6,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm5 @@ -3159,14 +3159,13 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm5 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0] -; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,0,3,7,0] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm14 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,4,7,3,6,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,4,7,3,6,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm8 @@ -3188,7 +3187,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,0,3,7,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,0,3,7,0,0,0] ; 
AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] @@ -3388,17 +3387,15 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,9,u,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,6,9,0,13,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,9,u,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,9,0,12,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] @@ -3465,7 +3462,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,10,14,u,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9 @@ -3487,7 +3484,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,u,0,3,7,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm11 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] @@ -3495,10 +3492,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,4,8,11,15,u,u,u] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,4,7,11,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,4,8,11,15,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,4,7,11,14,0,0,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm16, %zmm13 @@ -3710,17 +3706,15 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,9,u,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,6,9,0,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,9,u,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,9,0,12,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] @@ -3787,7 +3781,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,10,14,u,u,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9 @@ -3809,7 +3803,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,u,0,3,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] @@ -3817,10 +3811,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,4,8,11,15,u,u,u] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,4,7,11,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,4,8,11,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,4,7,11,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm16, %zmm13 @@ -3868,48 +3861,48 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -3934,48 +3927,48 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] ; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = 
[0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] ; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -4000,48 +3993,48 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] ; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] ; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] ; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -4066,48 +4059,48 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw 
{{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] ; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -5592,7 +5585,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm5, %ymm3 @@ -6012,7 +6005,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw 
{{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm1 @@ -6419,7 +6412,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm9 @@ -6434,7 +6427,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,1,u,4,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,1,0,4,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -6463,7 +6456,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,6,1,u,5,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,1,0,5,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm1, 
%ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -6633,7 +6626,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,2,6,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 @@ -6664,7 +6657,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] ; AVX2-FCP-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] @@ -6682,7 +6675,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,7,3,6,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,7,3,6,0,0,0] ; 
AVX2-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm15 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15 @@ -6691,7 +6684,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm11 @@ -6737,7 +6730,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,4,0,3,7,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,4,0,3,7,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 @@ -7158,26 +7151,23 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermd %zmm14, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,u,u,u,4,8,11,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,0,0,4,8,11,15] ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpermd %zmm14, %zmm16, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,u,u,4,7,11,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,7,11,14] ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15] ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm19, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28 @@ -7268,7 +7258,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm18, %zmm11 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,u,u,u,5,8,12,15] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,5,8,12,15] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] @@ -7301,7 +7291,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm10, %zmm10 @@ -7310,7 +7300,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,7,10,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,7,10,14,0,0,0] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] @@ -7336,7 +7326,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm23 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,u,0,3,7,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0] ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] @@ -7352,7 +7342,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,u,u,6,9,13,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,0,0,0,6,9,13,0] ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] @@ -7363,10 +7353,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,13,2,6,9,13] ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,4,7,11,14,u,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,7,11,14,0,0,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm17 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] @@ -7388,7 +7377,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,u,u,u,6,10,13,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,0,0,6,10,13,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] @@ -7398,7 +7387,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,4,8,11,15,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,4,8,11,15,0,0,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 @@ -7416,7 +7405,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,10,3,14,7,10,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,10,3,14,7,10,3] ; 
AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] @@ -7820,23 +7809,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,u,u,u,4,8,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,0,0,0,4,8,11,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,u,u,u,4,7,11,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,4,7,11,14] ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 
= [8,1,12,5,12,5,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm19, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 @@ -7930,8 +7916,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,u,u,u,5,8,12,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,0,0,0,5,8,12,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm27, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] @@ -7969,7 +7955,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3,4,5],xmm14[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14 @@ -7986,7 +7972,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd 
{{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,7,10,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,7,10,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm8, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm14 @@ -8010,7 +7996,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,u,u,6,9,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm12, %zmm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -8022,7 +8008,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm18, %zmm12 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm11, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,u,0,3,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm11, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] @@ -8033,7 +8019,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,4,7,11,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm14 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] @@ -8055,7 +8041,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2],xmm0[3],xmm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,u,u,u,6,10,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,0,0,0,6,10,13,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -8066,7 +8052,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm19, %zmm12 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,4,8,11,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 @@ -8084,7 +8070,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm2 = xmm2[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] @@ -8125,7 +8111,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -8145,7 +8131,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -8163,7 +8149,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt 
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -8180,7 +8166,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} @@ -8193,7 +8179,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} @@ -8206,7 +8192,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} @@ -8219,7 +8205,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} @@ -8247,7 +8233,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -8267,7 +8253,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -8285,7 +8271,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -8302,7 +8288,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} @@ -8315,7 +8301,7 @@ define void 
@load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} @@ -8328,7 +8314,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} @@ -8341,7 +8327,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} @@ -8369,7 +8355,7 @@ 
define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -8389,7 +8375,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -8407,7 +8393,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; 
AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -8424,7 +8410,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} @@ -8437,7 +8423,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} @@ -8450,7 +8436,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} ; 
AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} @@ -8463,7 +8449,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} @@ -8491,7 +8477,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -8511,7 +8497,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -8529,7 +8515,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -8546,7 +8532,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} @@ -8559,7 +8545,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} @@ -8572,7 +8558,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} @@ -8585,7 +8571,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} @@ -11532,7 +11518,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] @@ -11577,7 +11563,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %ymm5, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] @@ -11659,7 +11645,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] @@ -12419,7 +12405,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] @@ -12462,7 +12448,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] @@ -13276,7 +13262,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] @@ -13334,7 +13320,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,1,u,4,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,1,0,4,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm8 @@ -13380,7 +13366,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,1,u,5,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,1,0,5,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm8 @@ -13424,7 +13410,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -13722,7 +13708,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte 
Folded Reload ; AVX2-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,3,7,2,6,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,3,7,2,6,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5 @@ -13811,7 +13797,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] @@ -13827,7 +13813,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,7,3,6,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,7,3,6,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] @@ -13934,7 +13920,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm10 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,0,3,7,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,4,0,3,7,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm6 @@ -14883,7 +14869,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm8 ; AVX512-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload @@ -14963,9 +14949,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,5,9,u,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm4 @@ -15061,7 +15047,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,6,9,u,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [2,6,9,0,13,0,0,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 @@ -15131,7 +15117,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10 @@ -15193,8 +15179,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = 
[3,6,10,13,3,6,10,13] ; AVX512-FCP-NEXT: vpermd %zmm21, %zmm24, %zmm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 @@ -15215,13 +15200,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] ; AVX512-FCP-NEXT: vpermd %ymm31, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm8 @@ -15241,7 +15225,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,u,u,u,4,7,11,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,4,7,11,14] ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm6, %zmm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, 
%ymm13 @@ -15270,8 +15254,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7 @@ -15312,7 +15295,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,u,u,u,4,8,11,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,0,0,4,8,11,15] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm11, %zmm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm13 @@ -15343,7 +15326,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6 ; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 @@ -15389,7 +15372,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 @@ -15428,7 +15411,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,7,10,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,7,10,14,0,0,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] @@ -15449,7 +15432,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [2,u,u,u,6,9,13,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm20 = [2,0,0,0,6,9,13,0] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm20, %zmm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 @@ -15502,7 +15485,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,4,7,11,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,4,7,11,14,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 @@ -15520,7 +15503,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [3,u,u,u,6,10,13,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -15569,7 +15552,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} 
ymm3 = [1,4,8,11,15,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -15584,7 +15567,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm15 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm15 @@ -15618,7 +15601,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -16526,7 +16509,7 @@ define void 
@load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm0 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -16565,9 +16548,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,5,9,u,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm4 @@ -16664,7 +16647,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa 
{{.*#+}} ymm4 = [2,6,9,u,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,9,0,13,0,0,0] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 @@ -16735,7 +16718,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 @@ -16793,8 +16776,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] ; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm27, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 @@ -16815,13 +16797,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm30, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FCP-NEXT: 
vinserti32x4 $2, %xmm1, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 @@ -16841,7 +16822,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,u,u,u,4,7,11,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,4,7,11,14] ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm14 @@ -16869,8 +16850,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] @@ -16909,7 +16889,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [1,u,u,u,4,8,11,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm10 @@ -16940,7 +16920,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 @@ -16995,7 +16975,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 @@ -17038,7 +17018,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,7,10,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] @@ -17058,7 +17038,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,u,u,u,6,9,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,6,9,13,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 @@ -17110,7 +17090,7 
@@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,4,7,11,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,4,7,11,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm22, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 @@ -17127,7 +17107,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,u,u,u,6,10,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,0,0,0,6,10,13,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -17174,7 +17154,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,4,8,11,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,4,8,11,15,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 @@ -17190,7 +17170,7 @@ define 
void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 @@ -17233,7 +17213,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -17289,7 +17269,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -17318,7 +17298,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17346,7 +17326,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17372,7 +17352,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2w 
%zmm1, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17394,7 +17374,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17416,7 +17396,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17436,7 +17416,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} @@ -17485,7 +17465,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -17514,7 +17494,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17542,7 +17522,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17568,7 +17548,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2w 
%zmm12, %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17590,7 +17570,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17612,7 +17592,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17632,7 +17612,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} @@ -17681,7 +17661,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -17710,7 +17690,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17738,7 +17718,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17764,7 +17744,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17786,7 +17766,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17808,7 +17788,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17828,7 +17808,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} @@ -17877,7 +17857,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -17906,7 +17886,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17934,7 +17914,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17960,7 +17940,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17982,7 +17962,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -18004,7 +17984,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512DQ-BW-FCP-NEXT: 
vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -18024,7 +18004,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 7d137d47217e0..1b637cd203c8f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -496,7 +496,7 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [3,7,3,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] ; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -527,11 +527,11 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,5,1,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [3,7,3,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -564,7 +564,7 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [3,7,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -595,11 +595,11 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,5,1,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [3,7,3,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1128,7 +1128,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] @@ -1147,7 +1147,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512-NEXT: vpblendd {{.*#+}} 
xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -1190,7 +1190,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7 @@ -1202,18 +1202,18 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; 
AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -1254,7 +1254,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] @@ -1273,7 +1273,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -1316,7 +1316,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7 @@ -1328,18 +1328,18 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -1377,23 +1377,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1411,23 +1411,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: 
vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1445,23 +1445,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; 
AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1479,23 +1479,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -2491,7 +2491,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] @@ -2556,7 +2556,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-NEXT: vmovq {{.*#+}} xmm17 = 
[3,7,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -2659,7 +2659,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm13 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 @@ -2693,7 +2693,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ -2709,7 +2709,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,6,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -2726,7 +2726,7 @@ define void 
@load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -2830,7 +2830,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] @@ -2895,7 +2895,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -2998,7 +2998,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 
%xmm1, %xmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 @@ -3032,7 +3032,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ -3048,7 +3048,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,6,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -3065,7 +3065,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -3169,49 +3169,49 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -3237,49 +3237,49 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; 
AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
xmm9 = [4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -3305,49 +3305,49 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-NEXT: 
vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -3373,49 +3373,49 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -5744,7 +5744,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm3[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] @@ -5906,7 +5906,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 ; AVX512-NEXT: vmovdqa %xmm0, %xmm6 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -5945,7 +5945,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload @@ -6077,7 +6077,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -6134,7 +6134,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -6231,7 +6231,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm11 @@ -6262,7 +6262,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] @@ -6298,7 +6298,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -6335,7 +6335,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] @@ -6401,7 +6401,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -6430,7 +6430,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -6517,7 +6517,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1 ; 
AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] @@ -6679,7 +6679,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 ; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm6 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -6718,7 +6718,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload @@ -6850,7 +6850,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0] +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -6907,7 +6907,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -7004,7 +7004,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm11 @@ -7035,7 +7035,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] @@ -7071,7 +7071,7 @@ 
define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -7108,7 +7108,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] @@ -7174,7 +7174,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -7203,7 +7203,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -12690,7 +12690,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -13079,7 +13079,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -13152,7 +13152,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,4,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4] ; 
AVX512-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -13493,7 +13493,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] ; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -13600,7 +13600,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -13832,7 +13832,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -13905,7 +13905,7 @@ define void 
@load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload @@ -14007,7 +14007,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -14080,7 +14080,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -14259,7 +14259,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill 
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -14326,7 +14326,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,6,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -14424,7 +14424,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] ; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -14530,7 +14530,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -14919,7 +14919,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -14992,7 +14992,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,4,0,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4] ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -15333,7 +15333,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512DQ-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vpblendd $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -15440,7 +15440,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -15672,7 +15672,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -15745,7 +15745,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload @@ -15847,7 +15847,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -15920,7 +15920,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -16099,7 +16099,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -16166,7 +16166,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: 
vpbroadcastq {{.*#+}} xmm5 = [2,6,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -16264,7 +16264,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index fadeaedf3dae3..7cb46b79f7f36 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -364,7 +364,7 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 ; AVX512-FCP-NEXT: vpmovqd %zmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rdx) @@ -386,7 +386,7 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vpmovqd %zmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rdx) @@ -408,7 +408,7 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 ; AVX512BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rdx) @@ -430,7 +430,7 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rdx) @@ -563,9 +563,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -576,9 
+576,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -589,9 +589,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -602,9 +602,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -615,9 +615,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -628,9 +628,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -641,9 +641,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 
64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -654,9 +654,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -883,11 +883,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] 
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -903,11 +903,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -923,11 +923,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -943,11 +943,11 @@ define void 
@load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -963,11 +963,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -983,11 +983,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1003,11 +1003,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1023,11 +1023,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1483,7 +1483,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1491,7 +1491,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1517,7 +1517,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1525,7 +1525,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1551,7 +1551,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1559,7 +1559,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1585,7 +1585,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), 
%zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1593,7 +1593,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1619,7 +1619,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1627,7 +1627,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512BW-NEXT: vpermt2d 
%zmm6, %zmm12, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1653,7 +1653,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1661,7 +1661,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1687,7 +1687,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1695,7 +1695,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1721,7 +1721,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1729,7 +1729,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index b462b5da0f017..a0ea6ddeca7df 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -106,7 +106,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] ; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -134,7 +134,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -162,7 +162,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -190,7 +190,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -309,13 +309,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride3_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -325,13 +325,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride3_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -341,13 +341,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride3_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -357,13 +357,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -373,13 +373,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride3_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa 
%xmm3, (%rdx) @@ -389,13 +389,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -405,13 +405,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -421,13 +421,13 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,5,8,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -609,11 +609,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -625,11 +625,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 
= [2,5,8,11,14,17,20,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -641,11 +641,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -657,11 +657,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -673,11 +673,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -689,11 +689,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -705,11 +705,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, 
%zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -721,11 +721,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1048,17 +1048,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; 
AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1071,17 +1071,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1094,17 +1094,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1117,17 +1117,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1140,17 +1140,17 @@ define void 
@load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1163,17 +1163,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1186,17 +1186,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1209,17 +1209,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1930,23 +1930,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512-NEXT: 
vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -1967,23 +1967,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2004,23 +2004,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2041,23 +2041,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2078,23 +2078,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512BW-NEXT: 
vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2115,23 +2115,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d 
%zmm1, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2152,23 +2152,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm10, 
%zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2189,23 +2189,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -3751,10 +3751,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3764,10 +3764,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512-NEXT: vpermt2d %zmm3, 
%zmm14, %zmm19 @@ -3777,9 +3777,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3816,10 +3816,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3829,10 +3829,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = 
[17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3842,9 +3842,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3881,10 +3881,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 
%zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3894,10 +3894,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3907,9 +3907,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3946,10 +3946,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3959,10 +3959,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3972,9 +3972,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -4011,10 +4011,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -4024,10 +4024,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -4037,9 +4037,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512BW-NEXT: vpermi2d 
%zmm2, %zmm10, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -4076,10 +4076,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -4089,10 +4089,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -4102,9 +4102,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -4141,10 +4141,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -4154,10 
+4154,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -4167,9 +4167,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -4206,10 +4206,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -4219,10 +4219,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -4232,9 +4232,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index eb05676ba09f1..3874581e621b3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -106,7 +106,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) @@ -134,7 +134,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) @@ -162,7 +162,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) @@ -190,7 +190,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) @@ -360,15 +360,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride4_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] ; AVX512-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -379,15 +379,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride4_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm0 = [0,4,8,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -398,15 +398,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride4_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -417,15 +417,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
xmm0 = [0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -436,15 +436,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride4_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -455,15 +455,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] ; AVX512BW-FCP-NEXT: 
vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -474,15 +474,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -493,15 +493,15 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = 
[0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -760,15 +760,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride4_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -779,15 +779,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride4_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 
= [0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -798,15 +798,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride4_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -817,15 +817,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -836,15 +836,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride4_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; 
AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -855,15 +855,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -874,15 +874,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-BW-NEXT: vpermi2d 
%zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -893,15 +893,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 4cfa5934c5cd2..f01aa90e3efc5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -388,16 +388,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm2 = [0,5,2,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,3,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -406,7 +406,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,1,6,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -422,16 +422,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,2,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,3,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -440,7 +440,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,1,6,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -456,16 +456,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,3,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] +; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -474,7 +474,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,1,6,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -490,16 +490,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512-NEXT: vmovdqa %xmm2, (%rdx) @@ -513,16 +513,16 @@ define void @load_i32_stride5_vf4(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx) @@ -536,16 +536,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
xmm6 = [4,9,14,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rdx) @@ -559,16 +559,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx) @@ -582,16 +582,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = 
[2,7,12,17] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) @@ -605,16 +605,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) @@ -628,16 +628,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-BW-NEXT: 
vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) @@ -651,16 +651,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,6,11,16] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) @@ -845,13 +845,13 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,5,2,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,3,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,6,3,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] @@ -861,7 +861,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 @@ -881,7 +881,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4,1,6,u] +; 
AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] @@ -905,13 +905,13 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,5,2,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,3,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,6,3,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] @@ -921,7 +921,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 @@ -941,7 +941,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vperm2i128 
{{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,1,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] @@ -965,13 +965,13 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,5,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,3,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,6,3,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] @@ -981,7 +981,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 @@ 
-1001,7 +1001,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,1,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] @@ -1020,26 +1020,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = 
[0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -1053,26 +1053,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1086,26 +1086,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: 
vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -1119,26 +1119,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = 
[3,8,13,18,23,28,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1152,26 +1152,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1185,26 +1185,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; 
AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1218,26 +1218,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = 
[0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1251,26 +1251,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1682,7 +1682,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm10 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 @@ -1699,7 +1699,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = 
[1,6,3,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,6,3,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpermd %ymm11, %ymm7, %ymm11 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] @@ -1718,7 +1718,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] ; AVX2-NEXT: vpermd %ymm13, %ymm7, %ymm13 ; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 @@ -1755,7 +1755,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [4,1,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0] ; AVX2-NEXT: vpermd %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7] @@ -1798,7 +1798,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vpermd %ymm10, %ymm7, %ymm10 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 @@ -1815,7 +1815,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,6,3,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,6,3,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm11, %ymm7, %ymm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] @@ -1834,7 +1834,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm7, %ymm13 ; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 @@ -1871,7 +1871,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,1,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0] ; AVX2-FP-NEXT: vpermd %ymm4, %ymm6, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7] @@ -1914,7 +1914,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-FCP-NEXT: vpmovsxbd 
{{.*#+}} xmm7 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm10 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 @@ -1931,7 +1931,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,6,3,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,6,3,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] @@ -1950,7 +1950,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm7, %ymm13 ; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 @@ -1987,7 +1987,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,1,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq 
{{.*#+}} ymm8 = [2,7,2,7,2,7,2,7] @@ -2028,22 +2028,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2051,25 +2051,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2089,22 +2089,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; 
AVX512-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2112,25 +2112,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2150,22 +2150,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-NEXT: vpermi2d %zmm0, 
%zmm6, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2173,25 +2173,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm9, 
%zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2211,22 +2211,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: vpermi2d 
%zmm1, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2234,25 +2234,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[4,9,14,19,24,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2272,22 +2272,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermi2d %zmm0, 
%zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2295,25 +2295,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2333,22 +2333,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512BW-FCP-NEXT: # zmm8 
= mem[0,1,2,3,0,1,2,3] @@ -2356,25 +2356,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2394,22 +2394,22 @@ define void 
@load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2417,25 +2417,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -2455,22 +2455,22 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, 
%zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2478,25 +2478,25 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) @@ -3373,7 +3373,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-NEXT: vpmovsxbd 
{{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 @@ -3419,7 +3419,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0] ; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] @@ -3463,7 +3463,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm9 = [2,7,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm1 @@ -3567,7 +3567,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,1,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] @@ -3662,7 +3662,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: 
vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 @@ -3708,7 +3708,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0] ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] @@ -3752,7 +3752,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [2,7,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm1 @@ -3856,7 +3856,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,1,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm10 = 
[4,1,6,0] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] @@ -3951,7 +3951,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 @@ -3997,7 +3997,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0] ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] @@ -4041,7 +4041,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [2,7,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 @@ -4145,7 +4145,7 @@ define void @load_i32_stride5_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,1,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] @@ -4235,13 +4235,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4251,17 +4251,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-NEXT: 
vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4271,13 +4271,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4285,7 +4285,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, 
%zmm18 ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4294,12 +4294,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4334,13 +4334,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4350,17 +4350,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] 
; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4370,13 +4370,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, 
%zmm17 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4384,7 +4384,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4393,12 +4393,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4433,13 +4433,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 
%zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4449,17 +4449,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4469,13 +4469,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4483,7 +4483,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4492,12 +4492,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; 
AVX512DQ-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4532,13 +4532,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4548,17 +4548,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: 
vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4568,13 +4568,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4582,7 +4582,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, 
%zmm0, %zmm16 @@ -4591,12 +4591,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4631,13 +4631,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4647,17 +4647,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4667,13 +4667,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4681,7 +4681,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4690,12 +4690,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4730,13 +4730,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; 
AVX512BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4746,17 +4746,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4766,13 +4766,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm17, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4780,7 +4780,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4789,12 +4789,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} 
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4829,13 +4829,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4845,17 +4845,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4865,13 +4865,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4879,7 +4879,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4888,12 +4888,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4928,13 +4928,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4944,17 +4944,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4964,13 +4964,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2d 
%zmm5, %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4978,7 +4978,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4987,12 +4987,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -6794,7 +6794,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 @@ -6903,7 +6903,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] @@ -6998,7 +6998,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpbroadcastd 1104(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [2,7,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 @@ -7194,7 +7194,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm7[0,1],ymm6[0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0] ; AVX2-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] @@ -7370,7 +7370,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 @@ -7479,7 +7479,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] @@ -7574,7 +7574,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpbroadcastd 1104(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,7,4,u] +; AVX2-FP-NEXT: 
vpmovsxbd {{.*#+}} xmm3 = [2,7,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 @@ -7770,7 +7770,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0] ; AVX2-FP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] @@ -7946,7 +7946,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 @@ -8055,7 +8055,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,6,3,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] @@ -8150,7 +8150,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastd 1104(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,7,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 @@ -8346,7 +8346,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] @@ -8517,7 +8517,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -8532,7 +8532,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -8542,7 +8542,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8574,12 +8574,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -8620,7 +8620,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} 
zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -8636,7 +8636,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -8648,7 +8648,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -8657,7 +8657,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -8666,7 +8666,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -8721,7 +8721,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -8736,7 +8736,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -8746,7 +8746,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; 
AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8778,12 +8778,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -8824,7 +8824,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -8840,7 +8840,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -8852,7 +8852,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -8861,7 +8861,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -8870,7 +8870,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -8925,7 +8925,7 @@ define 
void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -8940,7 +8940,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -8950,7 +8950,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8982,12 +8982,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; 
AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9028,7 +9028,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9044,7 +9044,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9056,7 +9056,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9065,7 +9065,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9074,7 +9074,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9129,7 +9129,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9144,7 +9144,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9154,7 +9154,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9186,12 +9186,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 
@@ -9232,7 +9232,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9248,7 +9248,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9260,7 +9260,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9269,7 +9269,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm10, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9278,7 +9278,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9333,7 +9333,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9348,7 +9348,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512BW-NEXT: vmovdqa64 %zmm21, 
%zmm24 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9358,7 +9358,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9390,12 +9390,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9436,7 +9436,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9452,7 +9452,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9464,7 +9464,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9473,7 +9473,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9482,7 +9482,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9537,7 +9537,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9552,7 +9552,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9562,7 +9562,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 
%zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9594,12 +9594,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9640,7 +9640,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9656,7 +9656,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9668,7 +9668,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9677,7 +9677,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9686,7 +9686,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9741,7 +9741,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9756,7 +9756,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9766,7 +9766,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9798,12 +9798,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, 
%zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9844,7 +9844,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9860,7 +9860,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9872,7 +9872,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9881,7 +9881,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9890,7 +9890,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9945,7 +9945,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9960,7 +9960,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9970,7 +9970,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10002,12 +10002,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -10048,7 +10048,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -10064,7 +10064,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -10076,7 +10076,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -10085,7 +10085,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -10094,7 +10094,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 491b299252699..aae4d9fa15e24 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -201,7 +201,7 @@ define void 
@load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm3 @@ -261,7 +261,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm3 @@ -321,7 +321,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 @@ -381,7 +381,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} 
xmm1 = [4,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 @@ -515,13 +515,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,6,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0] ; AVX2-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-NEXT: vmovdqa (%rdi), %xmm5 @@ -540,13 +540,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0] ; AVX2-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) @@ -563,13 +563,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} xmm0 = [0,6,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 @@ -588,13 +588,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-FP-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi) @@ -611,13 +611,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,6,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: 
vmovdqa 64(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 @@ -636,13 +636,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi) @@ -659,17 +659,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512-NEXT: vpermi2d %zmm1, 
%zmm0, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -685,17 +685,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -711,17 +711,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; 
AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -737,17 +737,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-FCP-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -763,17 +763,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -789,17 +789,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] 
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -815,17 +815,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] 
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -841,17 +841,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1330,34 +1330,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512-NEXT: vmovdqa %ymm7, (%rdx) @@ -1376,34 +1374,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) @@ -1422,34 +1418,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512DQ-NEXT: 
vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) @@ -1468,34 +1462,32 @@ 
define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) @@ -1514,34 +1506,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = 
[0,0,0,0,0,0,6,12] ; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm7, (%rdx) @@ -1560,34 +1550,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm5 = [0,6,12,18,24,30,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm8 = [0,1,2,3,4,10,8,14] ; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) @@ -1606,34 +1594,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rdx) @@ -1652,34 +1638,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 
= [0,1,2,3,4,5,12,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) @@ -2753,7 +2737,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k2 @@ -2767,14 +2751,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2786,7 
+2770,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2799,7 +2783,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 @@ -2813,7 +2797,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -2841,7 +2825,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 @@ -2855,14 +2839,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2874,7 +2858,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2887,7 +2871,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 @@ -2901,7 +2885,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -2929,7 +2913,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 @@ -2943,14 +2927,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, 
%zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2962,7 +2946,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2975,7 +2959,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 @@ -2989,7 +2973,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3017,7 +3001,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 @@ -3031,14 +3015,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3050,7 +3034,7 @@ define void 
@load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3063,7 +3047,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -3077,7 +3061,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3105,7 +3089,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; 
AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -3119,14 +3103,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3138,7 +3122,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3151,7 +3135,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -3165,7 +3149,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3193,7 +3177,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -3207,14 +3191,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm7 = [1,7,13,19,25,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3226,7 +3210,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3239,7 +3223,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -3253,7 +3237,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3281,7 +3265,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -3295,14 +3279,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, 
%zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3314,7 +3298,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3327,7 +3311,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -3341,7 +3325,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3369,7 +3353,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -3383,14 +3367,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3402,7 +3386,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[3,9,15,21,27,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3415,7 +3399,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -3429,7 +3413,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -5776,7 +5760,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; 
AVX512-NEXT: movb $56, %dil @@ -5798,7 +5782,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -5812,7 +5796,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -5832,7 +5816,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -5854,7 +5838,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512-NEXT: 
vmovdqa64 %zmm1, %zmm23 ; AVX512-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 @@ -5875,7 +5859,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -5921,7 +5905,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512-FCP-NEXT: movb $56, %dil @@ -5943,7 +5927,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -5957,7 +5941,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -5977,7 +5961,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -5999,7 +5983,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -6020,7 +6004,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6066,7 +6050,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-NEXT: movb $56, %dil @@ -6088,7 +6072,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -6102,7 +6086,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -6122,7 +6106,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -6144,7 +6128,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 @@ -6165,7 +6149,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6211,7 +6195,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: movb $56, %dil @@ -6233,7 +6217,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -6247,7 +6231,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -6267,7 +6251,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -6289,7 +6273,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -6310,7 +6294,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6356,7 +6340,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512BW-NEXT: movb $56, %dil @@ -6378,7 +6362,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -6392,7 +6376,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -6412,7 +6396,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -6434,7 +6418,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -6455,7 +6439,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6501,7 
+6485,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movb $56, %dil @@ -6523,7 +6507,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -6537,7 +6521,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -6557,7 +6541,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -6579,7 +6563,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -6600,7 +6584,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6646,7 +6630,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movb $56, %dil @@ -6668,7 +6652,7 @@ define void @load_i32_stride6_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -6682,7 +6666,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -6702,7 +6686,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -6724,7 +6708,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd 
{{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -6745,7 +6729,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6791,7 +6775,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil @@ -6813,7 +6797,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -6827,7 +6811,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -6847,7 +6831,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -6869,7 +6853,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -6890,7 +6874,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -11941,24 +11925,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; 
AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -12291,24 +12275,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -12641,24 +12625,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d 
%zmm1, %zmm0, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -12991,24 +12975,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -13341,24 +13325,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -13691,24 +13675,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -14041,24 +14025,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -14391,24 +14375,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[3,9,15,21,27,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index ef7d9a0e1556d..b49c35e081298 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -203,7 +203,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -234,9 +234,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] ; 
AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -272,7 +272,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -303,9 +303,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -341,7 +341,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512BW-NEXT: vpermi2d %ymm5, 
%ymm6, %ymm1 @@ -372,9 +372,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -410,7 +410,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -441,9 +441,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -777,19 +777,19 @@ define void 
@load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -807,19 +807,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] 
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -837,19 +837,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 
= [6,13,20,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -867,19 +867,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -897,19 +897,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -927,19 +927,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -957,19 +957,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] 
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -987,19 +987,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1286,7 +1286,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm3 = [0,7,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1333,7 +1333,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1389,7 +1389,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1436,7 +1436,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1492,7 +1492,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), 
%ymm10 ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1511,8 +1511,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,0,7,0,1,0,7,0] -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [1,7,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] @@ -1540,7 +1539,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1551,8 +1550,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm5 = [1,0,7,0,1,0,7,0] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,1,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] @@ -1593,46 +1591,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; 
AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm4, (%rsi) @@ -1653,46 +1644,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] 
-; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1713,46 +1697,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi) @@ -1773,46 +1750,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1833,46 +1803,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = 
[18,25,0,7,14,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -1893,46 +1856,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1953,46 +1909,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), 
%zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -2013,46 +1962,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, 
%zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] -; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] -; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -2674,7 +2616,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] ; AVX2-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3 @@ -2783,7 +2725,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [4,3,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -2900,7 +2842,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] ; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm2, %ymm3 @@ -3009,7 +2951,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [4,3,0,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-FP-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -3126,7 +3068,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,7,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,7,6,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] @@ -3164,7 +3106,7 @@ 
define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm14 @@ -3233,7 +3175,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = [4,3,0,0] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,3,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm12 @@ -3260,7 +3202,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] @@ -3352,7 +3294,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, 
%zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 @@ -3382,7 +3324,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-NEXT: kmovw %edi, %k2 @@ -3399,7 +3341,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3409,7 +3351,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -3425,7 +3367,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3433,7 +3375,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3441,7 +3383,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3469,7 +3411,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: movw $992, 
%di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 @@ -3499,7 +3441,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-FCP-NEXT: kmovw %edi, %k2 @@ -3516,7 +3458,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3526,7 +3468,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -3542,7 +3484,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-FCP-NEXT: # 
zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3550,7 +3492,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3558,7 +3500,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3586,7 +3528,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; 
AVX512DQ-NEXT: kmovw %edi, %k1 @@ -3616,7 +3558,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-NEXT: kmovw %edi, %k2 @@ -3633,7 +3575,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3643,7 +3585,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -3659,7 +3601,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3667,7 +3609,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3675,7 +3617,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3703,7 +3645,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -3733,7 +3675,7 @@ define void 
@load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 @@ -3750,7 +3692,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3760,7 +3702,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -3776,7 +3718,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3784,7 +3726,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3792,7 +3734,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3820,7 +3762,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: movw $992, %di # 
imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -3850,7 +3792,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -3867,7 +3809,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3877,7 +3819,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -3893,7 +3835,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, 
%zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3901,7 +3843,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3909,7 +3851,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3937,7 +3879,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -3967,7 +3909,7 @@ define void 
@load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -3984,7 +3926,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3994,7 +3936,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -4010,7 +3952,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -4018,7 +3960,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -4026,7 +3968,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -4054,7 +3996,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; 
AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -4084,7 +4026,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -4101,7 +4043,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -4111,7 +4053,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -4127,7 +4069,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = 
[0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -4135,7 +4077,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -4143,7 +4085,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -4171,7 +4113,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -4201,7 +4143,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -4218,7 +4160,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -4228,7 +4170,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -4244,7 +4186,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -4252,7 +4194,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -4260,7 +4202,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -5587,7 +5529,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5858,7 +5800,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -6103,7 +6045,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6374,7 +6316,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] +; AVX2-FP-NEXT: 
vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -6622,7 +6564,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6 @@ -6707,7 +6649,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 @@ -6884,7 +6826,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,3,0,0] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,3,0,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] @@ -6946,7 +6888,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] @@ -7131,7 +7073,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 @@ -7157,7 +7099,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512-NEXT: movw $480, %di # imm = 0x1E0 @@ -7184,7 +7126,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; 
AVX512-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7207,7 +7149,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7236,7 +7178,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7256,7 +7198,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7274,7 +7216,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512-NEXT: vmovdqa 
{{.*#+}} xmm14 = [6,13,20,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7323,7 +7265,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -7349,7 +7291,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -7376,7 +7318,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7399,7 +7341,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm24 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7428,7 +7370,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7448,7 +7390,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7466,7 +7408,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: 
vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7515,7 +7457,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 @@ -7541,7 +7483,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 @@ -7568,7 +7510,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7591,7 +7533,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512DQ-NEXT: 
vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7620,7 +7562,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7640,7 +7582,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7658,7 +7600,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7707,7 +7649,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[0,7,14,21,28,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -7733,7 +7675,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -7760,7 +7702,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7783,7 +7725,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7812,7 +7754,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7832,7 +7774,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7850,7 +7792,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7899,7 +7841,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, 
%zmm8 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -7925,7 +7867,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 @@ -7952,7 +7894,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7975,7 +7917,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8004,7 +7946,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -8024,7 +7966,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -8042,7 +7984,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -8091,7 +8033,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -8117,7 +8059,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm14, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -8144,7 +8086,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -8167,7 +8109,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8196,7 +8138,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vinserti32x4 
$0, %xmm30, %zmm24, %zmm24 @@ -8216,7 +8158,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -8234,7 +8176,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -8283,7 +8225,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -8309,7 +8251,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = 
[1,8,15,22,29,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 @@ -8336,7 +8278,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -8359,7 +8301,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8388,7 +8330,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -8408,7 +8350,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # 
zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -8426,7 +8368,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -8475,7 +8417,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 @@ -8501,7 +8443,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -8528,7 +8470,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -8551,7 +8493,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8580,7 +8522,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -8600,7 +8542,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # 
zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -8618,7 +8560,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -11335,7 +11277,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12387,7 +12329,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13437,7 +13379,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm11 @@ -13608,7 +13550,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 @@ -14701,27 +14643,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -15153,27 +15095,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -15605,27 +15547,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -16057,27 +15999,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -16509,27 +16451,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa 
{{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -16961,27 +16903,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa 
{{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -17413,27 +17355,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -17865,27 +17807,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index ed7f9a08140a3..2fd173c729170 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -219,7 +219,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -286,7 +286,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -353,7 +353,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -420,7 +420,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq 
{{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -752,23 +752,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -786,23 +786,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -820,23 +820,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = 
[1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -854,23 +854,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -888,23 +888,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: 
vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -922,23 +922,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = 
[6,14,22,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -956,23 +956,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; 
AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -990,23 +990,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1590,52 +1590,44 @@ define void 
@load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] 
-; AVX512-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm4, (%rsi) @@ -1658,52 +1650,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm8 = [0,0,0,0,4,12,20,28] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1726,52 +1710,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 
192(%rdi), %zmm3 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; 
AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi) @@ -1794,52 +1770,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,8,16,24,0,8,16,24] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1862,52 +1830,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: 
vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; 
AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -1930,52 +1890,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; 
AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1998,52 +1950,44 @@ 
define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -2066,52 +2010,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] -; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] -; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; 
AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index b008338bb5a0f..2381df6d73289 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -244,11 +244,11 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-FCP-LABEL: load_i64_stride2_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] ; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -270,11 +270,11 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] ; 
AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -296,11 +296,11 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] ; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -322,11 +322,11 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -462,9 +462,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = 
[1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -475,9 +475,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -488,9 +488,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -501,9 +501,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -514,9 +514,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -527,9 +527,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -540,9 +540,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; 
AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -553,9 +553,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -785,11 +785,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -805,11 +805,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -825,11 +825,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -845,11 +845,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -865,11 +865,11 @@ define void 
@load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -885,11 +885,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -905,11 +905,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, 
%zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -925,11 +925,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1388,7 +1388,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1396,7 +1396,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} 
zmm12 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1422,7 +1422,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1430,7 +1430,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1456,7 +1456,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1464,7 +1464,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; 
AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1490,7 +1490,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1498,7 +1498,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1524,7 +1524,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1532,7 +1532,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1558,7 +1558,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1566,7 +1566,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1592,7 +1592,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1600,7 +1600,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 
%zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1626,7 +1626,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1634,7 +1634,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -2667,7 +2667,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm18 @@ 
-2683,7 +2683,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2729,7 +2729,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2745,7 +2745,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2791,7 +2791,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm14, 
%zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2807,7 +2807,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2853,7 +2853,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2869,7 +2869,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2915,7 +2915,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} 
zmm16 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2931,7 +2931,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2977,7 +2977,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2993,7 +2993,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -3039,7 +3039,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 
768(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -3055,7 +3055,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -3101,7 +3101,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -3117,7 +3117,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index a58b090fbeafe..4e9440140592e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -308,11 +308,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -324,11 +324,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -340,11 +340,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), 
%zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -356,11 +356,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -372,11 +372,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512BW-NEXT: vpermi2q 
%zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -388,11 +388,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -404,11 +404,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -420,11 +420,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -639,17 +639,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -662,17 +662,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 
(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -685,17 +685,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512DQ-NEXT: 
vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -708,17 +708,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -731,17 +731,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -754,17 +754,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = 
[0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -777,17 +777,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = 
[10,13,0,3,6,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -800,17 +800,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1216,23 +1216,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), 
%zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1253,23 +1253,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1290,23 +1290,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1327,23 +1327,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1364,23 +1364,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1401,23 +1401,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1438,23 
+1438,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1475,23 +1475,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -2388,10 +2388,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512-NEXT: 
vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2401,10 +2401,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2414,9 +2414,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2453,10 +2453,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2466,10 +2466,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2479,9 +2479,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2518,10 +2518,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2531,10 +2531,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2544,9 +2544,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-NEXT: 
vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2583,10 +2583,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2596,10 +2596,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2609,9 +2609,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2648,10 +2648,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2661,10 +2661,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2674,9 +2674,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; 
AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2713,10 +2713,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2726,10 +2726,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, 
%zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2739,9 +2739,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2778,10 +2778,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2791,10 +2791,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] 
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2804,9 +2804,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2843,10 +2843,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2856,10 +2856,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2869,9 +2869,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -4880,7 +4880,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 @@ 
-4895,10 +4895,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -4923,7 +4923,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -4934,7 +4934,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -4943,7 +4943,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5003,7 +5003,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5018,10 +5018,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5046,7 +5046,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5057,7 +5057,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5066,7 +5066,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5126,7 +5126,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5141,10 +5141,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5169,7 +5169,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5180,7 +5180,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5189,7 +5189,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5249,7 +5249,7 
@@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5264,10 +5264,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5292,7 +5292,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5303,7 +5303,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm0, %zmm31, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5312,7 +5312,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5372,7 +5372,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5387,10 +5387,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; 
AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5415,7 +5415,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5426,7 +5426,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5435,7 +5435,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5495,7 +5495,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; 
AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5510,10 +5510,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5538,7 +5538,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5549,7 +5549,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = 
[0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5558,7 +5558,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5618,7 +5618,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5633,10 +5633,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5661,7 +5661,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5672,7 +5672,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5681,7 +5681,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5741,7 +5741,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), 
%zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5756,10 +5756,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5784,7 +5784,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5795,7 +5795,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5804,7 +5804,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index a4a518cb4be85..f27619738a0ea 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -478,22 +478,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = 
[2,7,12,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqa %ymm2, (%rsi) @@ -508,22 +508,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[12,1,6,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -538,22 +538,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512DQ-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) @@ -568,22 +568,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -598,22 +598,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512BW-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) @@ -628,22 +628,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} ymm6 = [11,0,5,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -658,22 +658,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512DQ-BW-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) @@ -688,22 +688,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -1115,33 +1115,33 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; 
AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1173,33 +1173,33 @@ define void 
@load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 
%zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1231,33 +1231,33 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} 
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1289,33 +1289,33 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1347,33 +1347,33 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermi2q %zmm0, 
%zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1405,33 +1405,33 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512BW-FCP-NEXT: vpermi2q 
%zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1463,33 +1463,33 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; 
AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1521,33 +1521,33 @@ define void 
@load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -2451,18 +2451,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2472,7 +2472,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = 
[0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2482,13 +2482,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2498,11 +2498,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -2513,7 +2513,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; 
AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2548,18 +2548,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2569,7 +2569,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2579,13 +2579,13 @@ define void 
@load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2595,11 +2595,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -2610,7 +2610,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; 
AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2645,18 +2645,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2666,7 +2666,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2676,13 +2676,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2692,11 +2692,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -2707,7 +2707,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2742,18 +2742,18 @@ define void 
@load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2763,7 +2763,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2773,13 +2773,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2789,11 +2789,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -2804,7 +2804,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2839,18 +2839,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2860,7 +2860,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2870,13 +2870,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2886,11 +2886,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -2901,7 +2901,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2936,18 +2936,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm7 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2957,7 +2957,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2967,13 +2967,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] 
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2983,11 +2983,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -2998,7 +2998,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -3033,18 +3033,18 @@ define void @load_i64_stride5_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -3054,7 +3054,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -3064,13 +3064,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -3080,11 +3080,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -3095,7 +3095,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -3130,18 +3130,18 @@ define void @load_i64_stride5_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -3151,7 +3151,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -3161,13 +3161,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm15 = 
mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -3177,11 +3177,11 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 @@ -3192,7 +3192,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -5191,7 +5191,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5205,7 +5205,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5223,7 +5223,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5253,7 +5253,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5293,7 +5293,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5308,7 +5308,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5322,7 +5322,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5332,14 +5332,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5392,7 +5392,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5406,7 +5406,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5424,7 +5424,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5454,7 +5454,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5494,7 +5494,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5509,7 +5509,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; 
AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5523,7 +5523,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5533,14 +5533,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5593,7 +5593,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5607,7 +5607,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5625,7 +5625,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5655,7 +5655,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5695,7 +5695,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 
64-byte Folded Reload ; AVX512DQ-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5710,7 +5710,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5724,7 +5724,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5734,14 +5734,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-NEXT: vpermt2q 
%zmm0, %zmm7, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5794,7 +5794,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5808,7 +5808,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5826,7 +5826,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 
%zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5856,7 +5856,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5896,7 +5896,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5911,7 +5911,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5925,7 +5925,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5935,14 +5935,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5995,7 +5995,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ 
-6009,7 +6009,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6027,7 +6027,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6057,7 +6057,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6097,7 +6097,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq 
{{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6112,7 +6112,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6126,7 +6126,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6136,14 +6136,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -6196,7 +6196,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -6210,7 +6210,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6228,7 +6228,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6258,7 +6258,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6298,7 +6298,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6313,7 +6313,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6327,7 +6327,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6337,14 +6337,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -6397,7 +6397,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -6411,7 +6411,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm11, 
%zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6429,7 +6429,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6459,7 +6459,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6499,7 +6499,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; 
AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6514,7 +6514,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6528,7 +6528,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6538,14 +6538,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -6598,7 +6598,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -6612,7 +6612,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6630,7 +6630,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill @@ -6660,7 +6660,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6700,7 +6700,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6715,7 +6715,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6729,7 +6729,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6739,14 +6739,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -11095,18 +11095,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; 
AVX512-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11209,7 +11209,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -11256,7 +11256,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -11295,7 +11295,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 
%zmm1, %zmm27 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -11327,7 +11327,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -11339,7 +11339,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -11563,18 +11563,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} ymm29 = [1,6,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11677,7 +11677,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -11724,7 +11724,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q 
%zmm19, %zmm29, %zmm4 @@ -11763,7 +11763,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -11795,7 +11795,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -11807,7 +11807,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -12031,18 +12031,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), 
%zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12145,7 +12145,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -12192,7 +12192,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = 
[0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -12231,7 +12231,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -12263,7 +12263,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -12275,7 +12275,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -12499,18 +12499,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12613,7 +12613,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -12660,7 +12660,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm26, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -12699,7 +12699,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -12731,7 +12731,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -12743,7 +12743,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -12967,18 +12967,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13081,7 +13081,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -13128,7 +13128,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -13167,7 +13167,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -13199,7 +13199,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -13211,7 +13211,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -13435,18 +13435,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13549,7 +13549,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; 
AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -13596,7 +13596,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -13635,7 +13635,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -13667,7 +13667,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -13679,7 +13679,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -13903,18 +13903,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, 
%zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14017,7 +14017,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -14064,7 +14064,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -14103,7 +14103,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -14135,7 +14135,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -14147,7 +14147,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} @@ -14371,18 +14371,18 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14485,7 +14485,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 @@ -14532,7 +14532,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 @@ -14571,7 +14571,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 @@ -14603,7 +14603,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 @@ -14615,7 +14615,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 911e15491899e..21e1b17760c24 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -563,32 +563,32 @@ define void 
@load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa 
%ymm0, (%rsi) @@ -605,32 +605,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [11,1,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] ; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = 
ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -647,32 +646,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512DQ-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) @@ -689,32 +688,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [11,1,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; 
AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -731,32 +729,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512BW-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] +; AVX512BW-NEXT: vpmovsxbq 
{{.*#+}} ymm7 = [11,1,7,0] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -773,32 +771,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 
= [0,1,2,4] ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 ; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [11,1,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] ; AVX512BW-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -815,32 +812,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; 
AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512DQ-BW-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -857,32 +854,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [11,1,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -1374,7 
+1370,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k1 @@ -1401,7 +1397,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1411,7 +1407,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: movb $24, %dil ; AVX512-NEXT: kmovw %edi, %k2 @@ -1422,7 +1418,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1459,7 +1455,7 
@@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 @@ -1486,7 +1482,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1496,7 +1492,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: movb $24, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 @@ -1507,7 +1503,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, 
%zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1544,7 +1540,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 @@ -1571,7 +1567,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1581,7 +1577,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: movb $24, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 @@ -1592,7 +1588,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} ymm9 = [11,1,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1629,7 +1625,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -1656,7 +1652,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1666,7 +1662,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: movb $24, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 @@ -1677,7 +1673,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-FCP-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1714,7 +1710,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -1741,7 +1737,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1751,7 +1747,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: movb $24, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -1762,7 +1758,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1799,7 +1795,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -1826,7 +1822,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1836,7 +1832,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: movb $24, %dil ; 
AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -1847,7 +1843,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1884,7 +1880,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -1911,7 +1907,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1921,7 +1917,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} 
ymm8 = [10,0,6,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: movb $24, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -1932,7 +1928,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1969,7 +1965,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -1996,7 +1992,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -2006,7 +2002,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $24, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -2017,7 +2013,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -3164,7 +3160,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512-NEXT: movb $56, %dil @@ -3187,7 +3183,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3206,7 +3202,7 @@ define void 
@load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512-NEXT: movb $24, %dil @@ -3228,7 +3224,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -3307,7 +3303,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512-FCP-NEXT: movb $56, %dil @@ -3330,7 +3326,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3349,7 +3345,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512-FCP-NEXT: movb $24, %dil @@ -3371,7 +3367,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -3450,7 +3446,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-NEXT: movb $56, %dil @@ -3473,7 +3469,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3492,7 +3488,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-NEXT: movb $24, %dil @@ -3514,7 +3510,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -3593,7 +3589,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: movb $56, %dil @@ -3616,7 +3612,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3635,7 +3631,7 @@ define void 
@load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: movb $24, %dil @@ -3657,7 +3653,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -3736,7 +3732,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512BW-NEXT: movb $56, %dil @@ -3759,7 +3755,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3778,7 
+3774,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: movb $24, %dil @@ -3800,7 +3796,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -3879,7 +3875,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: movb $56, %dil @@ -3902,7 +3898,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 
%zmm15 {%k1} @@ -3921,7 +3917,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: movb $24, %dil @@ -3943,7 +3939,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -4022,7 +4018,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: movb $56, %dil @@ -4045,7 +4041,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; 
AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -4064,7 +4060,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: movb $24, %dil @@ -4086,7 +4082,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -4165,7 +4161,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil @@ -4188,7 +4184,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] 
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -4207,7 +4203,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movb $24, %dil @@ -4229,7 +4225,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} @@ -6788,7 +6784,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6807,7 +6803,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6868,11 +6864,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7140,7 +7136,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7159,7 +7155,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7220,11 +7216,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7492,7 +7488,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7511,7 +7507,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} ymm10 = [1,7,13,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7572,11 +7568,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7844,7 +7840,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7863,7 +7859,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7924,11 +7920,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8196,7 +8192,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8215,7 +8211,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] +; AVX512BW-NEXT: 
vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8276,11 +8272,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8548,7 +8544,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8567,7 +8563,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = 
[1,7,13,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8628,11 +8624,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8900,7 +8896,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8919,7 +8915,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbq 
{{.*#+}} ymm10 = [1,7,13,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8980,11 +8976,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9252,7 +9248,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9271,7 +9267,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} 
ymm10 = [1,7,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9332,11 +9328,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15070,19 +15066,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15826,19 +15822,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16582,19 +16578,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17338,19 +17334,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 
%zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18094,19 +18090,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18850,19 +18846,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19606,19 +19602,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, 
%zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20362,19 +20358,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 38e05657923ec..1d1da0954d675 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -649,10 +649,10 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -671,18 +671,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9 ; 
AVX512-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) @@ -703,15 +702,14 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -725,18 +723,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} 
ymm7 = ymm7[2,3],ymm9[2,3] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -757,10 +754,10 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -779,18 +776,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) @@ -811,15 +807,14 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -833,18 +828,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -865,10 +859,10 @@ define void 
@load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -887,18 +881,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512BW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -919,15 +912,14 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -941,18 +933,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = 
[5,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -973,10 +964,10 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -995,18 +986,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = 
ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-BW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -1027,15 +1017,14 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -1049,18 +1038,17 
@@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -1655,12 +1643,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] 
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1690,8 +1678,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -1765,12 +1752,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1800,8 +1787,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: # zmm15 = 
mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -1875,12 +1861,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1910,8 +1896,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -1985,12 +1970,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -2020,8 +2005,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -2092,12 +2076,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -2127,8 +2111,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,11,4,11] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -2202,12 +2185,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2q 
%zmm1, %zmm2, %zmm8 @@ -2237,8 +2220,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,11,4,11] -; AVX512BW-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -2312,12 +2294,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -2347,8 +2329,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: 
vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,11,4,11] -; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -2422,12 +2403,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -2457,8 +2438,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, 
%zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} @@ -3840,7 +3820,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -3860,7 +3840,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -3878,10 +3858,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -4028,7 +4007,7 @@ define void 
@load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -4048,7 +4027,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -4066,10 +4045,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -4216,7 +4194,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -4236,7 +4214,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -4254,10 +4232,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -4404,7 +4381,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; 
AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -4424,7 +4401,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -4442,10 +4419,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -4592,7 +4568,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), 
%ymm0, %ymm4 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -4612,7 +4588,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -4630,10 +4606,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -4780,7 +4755,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-FCP-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -4800,7 +4775,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -4818,10 +4793,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -4968,7 +4942,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -4988,7 +4962,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm18 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -5006,10 +4980,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -5156,7 +5129,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -5176,7 +5149,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] @@ -5194,10 +5167,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 @@ -8038,8 +8010,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] -; AVX512-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] ; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -8086,7 +8057,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -8126,11 +8097,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm0 ; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] ; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 @@ -8522,8 +8493,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] -; AVX512-FCP-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -8570,7 +8540,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -8610,11 +8580,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 @@ -9006,8 +8976,7 @@ 
define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] -; AVX512DQ-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -9054,7 +9023,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -9094,11 +9063,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm0 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: 
vmovdqa64 %zmm3, %zmm6 @@ -9490,8 +9459,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] -; AVX512DQ-FCP-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -9538,7 +9506,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -9578,11 +9546,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} 
xmm30 = [6,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 @@ -9964,8 +9932,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,4,11] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -10011,7 +9978,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -10049,10 +10016,10 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, 
%zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 @@ -10439,8 +10406,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,4,11] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -10486,7 +10452,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -10524,10 +10490,10 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 @@ -10914,8 +10880,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,4,11] -; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -10961,7 +10926,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm13 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -10999,10 +10964,10 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq 
{{.*#+}} xmm30 = [6,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 @@ -11389,8 +11354,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11436,7 +11400,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -11474,10 +11438,10 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 @@ -17386,7 +17350,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 2816(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] @@ -17567,7 +17531,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 2880(%rdi), %ymm4 ; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 ; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ -17949,8 +17913,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] ; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18009,7 +17972,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -18386,7 +18349,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] @@ -18567,7 +18530,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ 
-18949,8 +18912,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19009,7 +18971,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -19386,7 +19348,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 2816(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] @@ -19567,7 +19529,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 2880(%rdi), %ymm4 
; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ -19949,8 +19911,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -20009,7 +19970,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -20386,7 +20347,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] @@ -20567,7 +20528,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ -20949,8 +20910,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21009,7 +20969,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} 
xmm0 = [6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -21385,7 +21345,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -21562,7 +21522,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21936,8 +21896,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -21997,7 +21956,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -22368,7 +22327,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -22545,7 +22504,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22919,8 +22878,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -22980,7 +22938,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -23351,7 +23309,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -23528,7 +23486,7 @@ define void 
@load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23902,8 +23860,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -23963,7 +23920,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -24334,7 +24291,7 @@ define void 
@load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -24511,7 +24468,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24885,8 +24842,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -24946,7 +24902,7 @@ define void 
@load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index 4bceed858a3b8..ceb4948726760 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -738,28 +738,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; 
AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovaps %ymm7, (%rsi) @@ -798,28 +794,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -858,28 +850,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; 
AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovaps %ymm7, (%rsi) @@ -918,28 +906,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] 
-; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -978,28 +962,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512BW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512BW-NEXT: vpermi2q 
%zmm3, %zmm4, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovaps %ymm7, (%rsi) @@ -1038,28 +1018,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = 
[4,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -1098,28 +1074,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-BW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; 
AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovaps %ymm7, (%rsi) @@ -1158,28 +1130,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm7, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 86d7b498dfe29..0db78440d3aa7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -715,7 +715,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,5,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,2,5,7] ; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] @@ -748,7 +748,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] 
@@ -1045,7 +1045,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 @@ -1106,7 +1106,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 @@ -1128,7 +1128,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] @@ -1144,7 +1144,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] @@ -1160,7 +1160,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = 
zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] @@ -1176,7 +1176,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index 9f7959f0569a1..faecad65c395b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -770,7 +770,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] @@ -792,7 +792,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] ; AVX2-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] @@ -814,7 +814,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; 
AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] ; AVX2-FP-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] @@ -836,7 +836,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] ; AVX2-FCP-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 75fa3c29552ad..15f6ef4006fdd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -1679,7 +1679,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} 
ymm4 = [0,4,0,4,0,4,8,12] ; AVX512-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vpmovdb %zmm2, %xmm5 @@ -1719,7 +1719,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vpmovdb %zmm2, %xmm5 @@ -1759,7 +1759,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512DQ-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm5 @@ -1799,7 +1799,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm2, %xmm5 @@ -1834,7 +1834,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i8_stride4_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: 
vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -1862,7 +1862,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -1890,7 +1890,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -1918,7 +1918,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -3227,7 +3227,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3297,7 +3297,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3367,7 +3367,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512DQ-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3437,7 +3437,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa 
192(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3512,7 +3512,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512BW-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512BW-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm5, %zmm3, %zmm7 @@ -3562,7 +3562,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %zmm5, %zmm3, %zmm7 @@ -3612,7 +3612,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; 
AVX512DQ-BW-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm3, %zmm7 @@ -3662,7 +3662,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm5, %zmm3, %zmm7 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index d4e61dcdebb83..e05b5ab9ebe02 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -1459,7 +1459,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] @@ -1470,7 +1470,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] ; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 @@ -1479,7 +1479,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] @@ -1488,7 +1488,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; 
AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] ; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -1497,7 +1497,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] ; AVX2-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] @@ -1517,7 +1517,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] @@ -1528,7 +1528,7 @@ define void 
@load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] ; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 @@ -1537,7 +1537,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] @@ -1546,7 +1546,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = 
[65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -1555,7 +1555,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] ; AVX2-FP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] @@ -1575,7 +1575,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] @@ -1586,7 +1586,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 @@ -1595,7 +1595,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] @@ -1604,7 +1604,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -1613,7 +1613,7 @@ define void 
@load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] @@ -2793,7 +2793,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm15 -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] ; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] @@ -2833,20 +2833,20 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX2-NEXT: 
vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; AVX2-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] ; AVX2-NEXT: vmovdqa %xmm8, %xmm7 ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 ; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 @@ -2854,7 +2854,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] ; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] @@ -2867,7 +2867,7 @@ define void 
@load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] @@ -2875,7 +2875,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] @@ -2906,7 +2906,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm6 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] @@ -2945,20 +2945,20 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm7 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 @@ -2966,7 +2966,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] ; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] @@ -2979,7 +2979,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX2-FP-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] @@ 
-2987,7 +2987,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] @@ -3018,7 +3018,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] @@ -3057,20 +3057,20 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] ; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm7 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 @@ -3078,7 +3078,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = 
[65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] @@ -3091,7 +3091,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX2-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] @@ -3099,7 +3099,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] @@ -3130,7 +3130,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] @@ -3248,7 +3248,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX512-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -3358,7 +3358,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX512-FCP-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -3468,7 +3468,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX512DQ-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -3578,7 +3578,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -5480,7 +5480,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551615,255] ; AVX-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5511,7 +5511,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm8 = [18446744073709551615,255] ; AVX-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5610,7 +5610,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] @@ -5619,7 +5619,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -5630,7 +5630,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -5644,7 +5644,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -5658,7 +5658,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -5670,7 +5670,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm7 @@ -5701,7 +5701,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] @@ -5724,7 +5724,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; 
AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5 @@ -5746,7 +5746,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -5758,7 +5758,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 @@ -5802,7 +5802,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm14 ; AVX2-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-NEXT: vpshufb %xmm15, %xmm5, %xmm15 @@ -5876,7 +5876,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] @@ -5885,7 +5885,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -5896,7 +5896,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; 
AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -5910,7 +5910,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -5924,7 +5924,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -5936,7 +5936,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] ; 
AVX2-FP-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 @@ -5967,7 +5967,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] @@ -5990,7 +5990,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 @@ -6012,7 +6012,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -6024,7 +6024,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 @@ -6068,7 +6068,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm14 ; AVX2-FP-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 @@ -6142,7 +6142,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 
96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] @@ -6151,7 +6151,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -6162,7 +6162,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -6176,7 +6176,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -6190,7 +6190,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -6202,7 +6202,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 @@ -6233,7 +6233,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] @@ -6256,7 +6256,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 @@ -6278,7 +6278,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ 
-6290,7 +6290,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 @@ -6334,7 +6334,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm14 ; AVX2-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 @@ -6472,7 +6472,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX512-NEXT: vpternlogq $226, %ymm0, 
%ymm16, %ymm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm10, %ymm0 @@ -6564,7 +6564,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12 @@ -6677,7 +6677,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512-FCP-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX512-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 @@ -6769,7 +6769,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor 
%xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12 @@ -6882,7 +6882,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512DQ-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX512DQ-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 @@ -6974,7 +6974,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = 
[0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12 @@ -7087,7 +7087,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512DQ-FCP-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 @@ -7179,7 +7179,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 8a48db406c702..c77b232fde969 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -1701,7 +1701,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] ; AVX-NEXT: vpblendvb %xmm9, %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] @@ -1773,7 +1773,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1784,7 +1784,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] ; 
AVX2-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] @@ -1793,7 +1793,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -1810,7 +1810,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] @@ -1841,7 +1841,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; 
AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1852,7 +1852,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] ; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] @@ -1861,7 +1861,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -1878,7 +1878,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-FP-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-FP-NEXT: vpblendvb %xmm8, 
%xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] @@ -1909,7 +1909,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1920,7 +1920,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] @@ -1929,7 +1929,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -1946,7 +1946,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] @@ -1989,7 +1989,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] ; AVX512-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 
@@ -2057,7 +2057,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] ; AVX512-FCP-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] @@ -2125,7 +2125,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] ; AVX512DQ-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] @@ -2193,7 +2193,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] @@ -3420,31 +3420,31 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] ; AVX2-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 ; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 @@ -3460,9 +3460,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] ; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX2-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] ; AVX2-NEXT: vpor %xmm8, %xmm9, %xmm8 @@ -3476,7 +3476,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] @@ -3527,31 +3527,31 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vperm2i128 
{{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 @@ -3567,9 +3567,9 @@ 
define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] ; AVX2-FP-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] ; AVX2-FP-NEXT: vpor %xmm8, %xmm9, %xmm8 @@ -3583,7 +3583,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] @@ -3634,31 +3634,31 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), 
%ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = 
[65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 @@ -3674,9 +3674,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] ; AVX2-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 @@ -3690,7 +3690,7 @@ define void 
@load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] @@ -3796,7 +3796,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] @@ -3906,7 +3906,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] @@ -4016,7 +4016,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] @@ -4126,7 +4126,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] @@ -6535,14 +6535,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill @@ -6552,7 +6552,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX2-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm8 @@ -6585,7 +6585,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -6595,7 +6595,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm5 ; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: vpshufb %ymm14, %ymm3, %ymm15 @@ -6627,10 +6627,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 ; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4 @@ -6643,7 +6643,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 @@ -6655,7 +6655,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm9 ; AVX2-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 ; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 @@ -6808,14 +6808,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 ; 
AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill @@ -6825,7 +6825,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX2-FP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm8 @@ -6858,7 +6858,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -6868,7 +6868,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm5 ; AVX2-FP-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm15 @@ -6900,10 +6900,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, 
%ymm3, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm4 @@ -6916,7 +6916,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 @@ -6928,7 +6928,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm9 ; AVX2-FP-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 ; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 @@ -7081,14 +7081,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = 
[0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill @@ -7098,7 +7098,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX2-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 @@ -7131,7 +7131,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = 
[65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -7141,7 +7141,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm5 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm15 @@ -7173,10 +7173,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 @@ -7189,7 +7189,7 
@@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 @@ -7201,7 +7201,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm9 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 ; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 @@ -7468,7 +7468,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 @@ -7560,7 +7560,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 ; AVX512-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 @@ -7579,12 +7579,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm2 
= [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] ; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 ; AVX512-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm15, (%rsi) @@ -7721,7 +7721,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 @@ -7813,7 +7813,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 @@ -7832,12 +7832,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: 
vpternlogq $184, %zmm1, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) @@ -7974,7 +7974,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 @@ -8066,7 +8066,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 ; AVX512DQ-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 @@ -8085,12 +8085,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm1 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 ; AVX512DQ-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rsi) @@ -8227,7 +8227,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 @@ -8319,7 +8319,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 @@ -8338,12 +8338,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) diff 
--git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index e1d80dffc255a..0ee10a33c1d0c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -1231,7 +1231,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] @@ -1242,7 +1242,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] @@ -1253,7 +1253,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm6, %xmm5, 
%xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] @@ -1264,7 +1264,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1286,7 +1286,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] @@ -1297,7 +1297,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] @@ -1308,7 +1308,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] @@ -1319,7 +1319,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1341,7 +1341,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] @@ -1352,7 +1352,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] @@ -1363,7 +1363,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = 
[65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] @@ -1374,7 +1374,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2416,7 +2416,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10] ; AVX-NEXT: vpor %xmm10, %xmm9, %xmm10 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm9 = [18446744073709551615,255] ; AVX-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] @@ -2517,7 +2517,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] @@ -2531,7 +2531,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -2543,9 +2543,9 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm8 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX2-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-NEXT: 
vextracti128 $1, %ymm8, %xmm8 @@ -2556,7 +2556,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero ; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX2-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2567,7 +2567,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero ; AVX2-NEXT: vpor %xmm12, %xmm9, %xmm9 ; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] @@ -2577,7 +2577,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero ; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm11, 
%ymm1, %ymm0, %ymm11 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] @@ -2588,7 +2588,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero ; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -2615,7 +2615,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] @@ -2629,7 +2629,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} 
ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -2641,9 +2641,9 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8 @@ -2654,7 +2654,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero ; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2665,7 
+2665,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero ; AVX2-FP-NEXT: vpor %xmm12, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] @@ -2675,7 +2675,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero ; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] @@ -2686,7 +2686,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero ; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: 
vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -2713,7 +2713,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] @@ -2727,7 +2727,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -2739,9 +2739,9 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 @@ -2752,7 +2752,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero ; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2763,7 +2763,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero ; AVX2-FCP-NEXT: vpor %xmm12, %xmm9, %xmm9 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] @@ -2773,7 +2773,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero ; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] @@ -2784,7 +2784,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero ; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -2836,7 +2836,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX512-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm9, %ymm10 @@ -2932,7 +2932,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX512-FCP-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 @@ -3028,7 +3028,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX512DQ-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 @@ -3124,7 +3124,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10 @@ -4779,7 +4779,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX-NEXT: vpor %xmm12, %xmm13, %xmm13 -; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709486080,16777215] ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] @@ -4998,27 +4998,27 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, 
%xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 ; AVX2-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -5028,7 +5028,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX2-NEXT: 
vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] @@ -5038,7 +5038,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 @@ -5068,7 +5068,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -5081,7 +5081,7 @@ define void 
@load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -5095,7 +5095,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] @@ -5108,14 +5108,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 ; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, 
%ymm15 ; AVX2-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 ; AVX2-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 ; AVX2-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 @@ -5128,7 +5128,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm13 @@ -5210,27 +5210,27 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, 
%xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 ; 
AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -5240,7 +5240,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] @@ -5250,7 +5250,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 @@ -5280,7 +5280,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -5293,7 +5293,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -5307,7 +5307,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] @@ -5320,14 +5320,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; 
AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 @@ -5340,7 +5340,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm15, 
%xmm13 @@ -5423,38 +5423,37 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1 ; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -5465,7 +5464,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [0,18446744073709551360,16777215,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] @@ -5473,8 +5472,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] @@ -5496,7 +5494,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -5509,7 +5507,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; 
AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -5521,9 +5519,9 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] @@ -5539,9 +5537,9 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = 
[65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10 @@ -5554,7 +5552,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 @@ -5592,8 +5590,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,3,5,6] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] @@ -5666,7 +5663,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX512-NEXT: vpternlogq $226, %ymm12, %ymm16, %ymm8 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512-NEXT: vmovdqa %ymm11, %ymm12 @@ -5705,7 +5702,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm15 ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm14 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm8 @@ -5733,7 +5730,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12 ; AVX512-NEXT: vmovdqa %ymm11, %ymm8 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8 @@ -5845,8 +5842,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,6,1,2,4,6] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -5869,7 +5865,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; 
AVX512-FCP-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm11 ; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm18 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 @@ -5879,8 +5875,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] @@ -5907,7 +5902,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm14 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm13 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 @@ -5915,8 +5910,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,5,6,1,3,5,6] -; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] @@ -5936,7 +5930,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6 @@ -6076,7 +6070,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = 
[18446744073709551615,18446744073709551615,16777215,0] ; AVX512DQ-NEXT: vpternlogq $226, %ymm12, %ymm16, %ymm8 ; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm12 @@ -6115,7 +6109,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm14 ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm8 @@ -6143,7 +6137,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512DQ-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8 @@ -6255,8 +6249,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,6,1,2,4,6] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -6279,7 +6272,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 @@ -6289,8 +6282,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, 
%ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] @@ -6317,7 +6309,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm14 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm13 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 @@ -6325,8 +6317,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,5,6,1,3,5,6] -; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] @@ -6346,7 +6337,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6 @@ -6449,20 +6440,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] ; AVX512BW-NEXT: vpermw %zmm1, 
%zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -6636,20 +6627,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa 
{{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -6674,8 +6665,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6] -; AVX512BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -6702,8 +6692,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6] -; AVX512BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6] ; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm11[0,1,2,3,4,5,6],ymm12[7] @@ -6722,8 +6711,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6] -; AVX512BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6] ; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] @@ -6823,20 +6811,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa 
{{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -7010,20 +6998,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -7048,8 +7036,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -7076,8 +7063,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: 
# ymm12 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] @@ -7096,8 +7082,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] @@ -9597,7 +9582,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm13 ; AVX-NEXT: vmovdqa %xmm0, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm13 = [18446744073709486080,16777215] ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 432(%rdi), %xmm1 @@ -10153,7 +10138,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-NEXT: vmovdqa %ymm2, %ymm10 @@ -10166,7 +10151,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 ; AVX2-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-NEXT: vmovdqa %ymm5, %ymm9 @@ -10178,7 +10163,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 @@ -10202,7 +10187,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; 
AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 ; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3 @@ -10214,7 +10199,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 @@ -10230,7 +10215,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10252,7 +10237,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = 
[18446744073709551615,18446744073709551615,16777215,0] ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2 @@ -10273,7 +10258,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm6 @@ -10306,7 +10291,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10343,7 +10328,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 @@ -10375,7 +10360,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] @@ -10409,10 +10394,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] @@ -10447,15 +10432,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 ; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10470,7 +10455,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 @@ -10485,7 +10470,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 ; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm15 @@ -10498,7 +10483,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm0 @@ -10692,7 +10677,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 @@ -10705,7 +10690,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 ; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9 @@ -10717,7 +10702,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 @@ -10741,7 +10726,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm3 @@ -10753,7 +10738,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] ; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 @@ -10769,7 +10754,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10791,7 +10776,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2 @@ -10812,7 +10797,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm6 @@ -10845,7 +10830,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10882,7 +10867,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 @@ -10914,7 +10899,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] @@ -10948,10 +10933,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] @@ -10986,15 +10971,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11009,7 +10994,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 @@ -11024,7 +11009,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm15 @@ -11037,7 +11022,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, 
%ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm0 @@ -11231,7 +11216,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11243,7 +11228,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11254,7 +11239,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5 @@ -11276,11 +11261,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -11292,7 +11277,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,18446744073709551360,16777215,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 @@ -11307,7 +11292,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11322,14 +11307,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [18446744073709551615,18446744073709551615,16777215,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 @@ -11350,7 +11335,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpblendvb %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm4 @@ -11359,7 +11344,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm11 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11 @@ -11377,7 +11362,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -11415,7 +11400,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, 
%ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm3 @@ -11450,7 +11435,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 @@ -11484,10 +11469,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = 
[65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] @@ -11523,15 +11508,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11547,7 +11532,7 @@ define 
void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 @@ -11576,7 +11561,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm0 @@ -11685,7 +11670,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm7, %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -11865,7 +11850,7 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm3 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa %ymm3, %ymm15 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 @@ -11891,7 +11876,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm25 ; AVX512-NEXT: vmovdqa %ymm9, %ymm0 ; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm0 @@ -11936,7 +11921,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq $184, %ymm8, %ymm1, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, 
%zmm0, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm8, %zmm20 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0 ; AVX512-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm0 @@ -12096,7 +12081,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 ; AVX512-NEXT: vpternlogq $226, %ymm17, %ymm7, %ymm28 ; AVX512-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -12210,8 +12195,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -12262,8 +12246,7 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] @@ -12284,7 +12267,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpternlogq $202, %ymm27, %ymm30, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm3 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm2 @@ -12293,8 +12276,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] -; 
AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6] ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] @@ -12305,7 +12287,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm18, %zmm23 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm2 @@ -12352,7 +12334,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, 
%zmm20, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm3 @@ -12512,7 +12494,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512-FCP-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm3 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -12712,7 +12694,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm27 ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 @@ -12741,7 +12723,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX512DQ-NEXT: 
vinserti32x4 $2, %xmm3, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm25 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 ; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm0 @@ -12785,7 +12767,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %ymm8, %ymm29, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq $184, %zmm22, %zmm8, %zmm20 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 ; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm18, %ymm0 @@ -12947,7 +12929,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm18 = 
[18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512DQ-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm2 ; AVX512DQ-NEXT: vpternlogq $226, %ymm21, %ymm7, %ymm16 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -13060,8 +13042,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -13112,8 +13093,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] @@ -13134,7 +13114,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13144,8 +13124,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] @@ -13156,7 +13135,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; 
AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm18, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm2 @@ -13203,7 +13182,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm5, %ymm27, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm20, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm3 @@ -13364,7 +13343,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm8 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16 ; AVX512DQ-FCP-NEXT: 
vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -13448,13 +13427,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i8_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -13751,11 +13730,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] ; AVX512BW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] 
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] ; AVX512BW-NEXT: vpermw %zmm25, %zmm10, %zmm10 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] @@ -13819,21 +13798,21 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -13862,8 +13841,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -13916,8 +13894,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] -; AVX512BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6] ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] @@ -13940,8 +13917,7 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] ; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] -; AVX512BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6] ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] @@ -14032,9 +14008,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k2} @@ -14105,11 +14081,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm19, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm21 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm11 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] @@ -14143,7 +14119,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 @@ -14165,7 +14141,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} @@ -14184,13 +14160,13 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-LABEL: load_i8_stride7_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -14487,11 +14463,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm11, %zmm11 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm12 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] @@ -14552,21 +14528,21 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm9 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -14595,8 +14571,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -14649,8 +14624,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] @@ -14673,8 +14647,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] @@ -14765,9 +14738,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k1} @@ -14838,11 +14811,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm19, 
%xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] @@ -14877,7 +14850,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} @@ -14897,7 +14870,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 7f827326e122e..b1eb4d6af4eb7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -6102,7 +6102,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm0 @@ -6250,7 +6250,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload @@ -6735,7 +6735,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm18 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1 @@ -6852,7 +6852,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9 @@ -7327,7 +7327,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm18 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1 @@ -7444,7 +7444,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4 ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9 @@ -7863,7 +7863,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4 ; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2 @@ -7972,7 +7972,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] ; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9 ; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10 ; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11 @@ -8378,7 +8378,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), 
%ymm9 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2 @@ -8487,7 +8487,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11 @@ -13175,7 +13175,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] @@ -13493,7 +13493,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload @@ -14566,7 +14566,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 @@ -14849,7 +14849,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 @@ -15882,7 +15882,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 @@ -16165,7 +16165,7 @@ define void 
@load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 @@ -17134,7 +17134,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 @@ -17362,7 +17362,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] @@ -18261,7 +18261,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 @@ -18489,7 +18489,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll index 5d4c9e127727d..a034363895c0e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -322,7 +322,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 
$1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -332,7 +332,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -342,7 +342,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -352,7 +352,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -507,7 +507,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; 
AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -517,7 +517,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -527,7 +527,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -537,7 +537,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -790,9 +790,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -803,9 +803,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] 
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -816,9 +816,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -829,9 +829,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -1291,10 +1291,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 @@ -1311,10 +1311,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 @@ -1331,10 +1331,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 @@ -1351,10 +1351,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index a7ff13b0f872b..23ddcd7cd0262 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -115,7 +115,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512BW-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512BW-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512BW-NEXT: vmovq %xmm1, (%rcx) @@ -125,7 +125,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512BW-FCP-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) @@ -135,7 +135,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512DQ-BW-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) @@ -145,7 +145,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) @@ -331,7 +331,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx) @@ -346,7 +346,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: 
vmovq %xmm1, 16(%rcx) @@ -361,7 +361,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx) @@ -376,7 +376,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx) @@ -469,9 +469,9 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -492,9 +492,9 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -511,12 +511,12 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -537,7 +537,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX512-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -561,9 +561,9 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,0,0,u,1,1,u,2] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6] ; 
AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27] ; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 @@ -583,7 +583,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX512DQ-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -607,9 +607,9 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,0,0,u,1,1,u,2] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27] ; AVX512DQ-FCP-NEXT: vpternlogq $244, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 @@ -624,7 +624,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx) @@ -636,7 +636,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx) @@ -648,7 +648,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx) @@ 
-660,7 +660,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx) @@ -815,9 +815,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,5,0,6,6,0,7,7] ; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7] @@ -826,18 +826,18 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; 
AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] ; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] ; AVX2-NEXT: vpermd (%rdi), %ymm4, %ymm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] ; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm2, (%rcx) @@ -860,9 +860,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,0,0,u,1,1,u,2] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,1,0,2] ; AVX2-FP-NEXT: vpermd 
%ymm0, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] @@ -870,18 +870,18 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,5,0,6,6,0,7,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] ; AVX2-FP-NEXT: vpermd (%rdi), %ymm4, %ymm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] ; AVX2-FP-NEXT: vpermd 
%ymm0, %ymm4, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rcx) @@ -904,9 +904,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,0,0,u,1,1,u,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,1,0,2] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] @@ -914,18 +914,18 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,5,0,6,6,0,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] ; AVX2-FCP-NEXT: vpermd (%rdi), %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rcx) @@ -952,7 +952,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -964,7 +964,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] ; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -991,7 +991,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -1003,7 +1003,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -1030,7 +1030,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -1042,7 +1042,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -1069,7 +1069,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -1081,7 +1081,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -1094,9 +1094,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1108,9 +1108,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1122,9 +1122,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1136,9 +1136,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1434,9 +1434,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [5,5,u,6,6,u,7,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,5,0,6,6,0,7,7] ; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3 ; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 @@ -1461,9 +1461,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = 
[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,1,0,2] ; AVX2-NEXT: vpermd %ymm2, %ymm7, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7] @@ -1476,13 +1476,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,3,3,u,4,4,u,5] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,3,0,4,4,0,5] ; AVX2-NEXT: vpermd (%rdi), %ymm9, %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [2,u,3,3,u,4,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,3,3,0,4,4,0] ; AVX2-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; 
AVX2-NEXT: vpermd 32(%rdi), %ymm9, %ymm4 @@ -1513,9 +1513,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,1,0,2] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 48(%rsi), %xmm9 @@ -1535,11 +1535,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm7, %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 @@ -1552,17 +1552,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb 
%ymm7, %ymm2, %ymm8, %ymm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,3,0,4,4,0,5] ; AVX2-FP-NEXT: vpermd (%rdi), %ymm8, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 ; AVX2-FP-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,3,3,u,4,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,3,3,0,4,4,0] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 @@ -1590,9 +1590,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,1,0,2] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 48(%rsi), %xmm9 @@ -1612,11 +1612,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm5 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 @@ -1629,17 +1629,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,3,0,4,4,0,5] ; AVX2-FCP-NEXT: vpermd (%rdi), %ymm8, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; 
AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 ; AVX2-FCP-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,3,3,u,4,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,3,3,0,4,4,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 @@ -1676,7 +1676,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512-NEXT: vpermd (%rdx), %zmm4, %zmm5 ; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -1718,7 +1718,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512-NEXT: vpermd %ymm7, %ymm2, %ymm2 ; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1754,7 +1754,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -1796,7 +1796,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1832,7 +1832,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512DQ-NEXT: vpermd (%rdx), %zmm4, %zmm5 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -1874,7 +1874,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512DQ-NEXT: vpermd %ymm7, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1910,7 +1910,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -1952,7 +1952,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; 
AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1969,17 +1969,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1992,17 +1992,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = 
[42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -2015,17 +2015,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -2038,17 +2038,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -2668,9 +2668,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [5,5,u,6,6,u,7,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,5,0,6,6,0,7,7] ; 
AVX2-NEXT: vpermd %ymm3, %ymm8, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 @@ -2718,9 +2718,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,0,0,u,1,1,u,2] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,1,0,2] ; AVX2-NEXT: vpermd %ymm4, %ymm12, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm9 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] @@ -2758,9 +2758,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm11 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,3,3,u,4,4,u,5] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,3,3,0,4,4,0,5] ; AVX2-NEXT: vpermd (%rdi), %ymm12, %ymm13 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; 
AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm13 ; AVX2-NEXT: vpshufb %ymm2, %ymm13, %ymm13 @@ -2774,9 +2774,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,3,3,u,4,4,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,3,3,0,4,4,0] ; AVX2-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpermd %ymm3, %ymm12, %ymm3 ; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 @@ -2817,9 +2817,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,0,0,u,1,1,u,2] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,1,0,2] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm13, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm4 @@ -2831,9 +2831,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm10, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm5 @@ -2902,9 +2902,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] ; AVX2-FP-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm9 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm9, %ymm9 @@ -2918,9 +2918,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX2-FP-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm4, %ymm2 
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 @@ -2961,9 +2961,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,0,0,u,1,1,u,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,1,0,2] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 @@ -2975,9 +2975,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = 
[0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm5 @@ -3046,9 +3046,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] ; AVX2-FCP-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm9 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm9 @@ -3062,9 +3062,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 @@ -3121,7 +3121,7 @@ define 
void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] ; AVX512-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3146,7 +3146,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 @@ -3273,7 +3273,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] ; AVX512-FCP-NEXT: 
vpermd %ymm3, %ymm16, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3298,7 +3298,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 @@ -3425,7 +3425,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] ; AVX512DQ-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3450,7 +3450,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = 
xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512DQ-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 @@ -3577,7 +3577,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3602,7 +3602,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] ; AVX512DQ-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 @@ -3706,20 +3706,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = 
[0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 @@ -3744,20 +3744,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 @@ -3782,20 +3782,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 @@ -3820,20 +3820,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index 587f0cb26e1cf..68b180ef52565 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -123,7 +123,7 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] ; AVX512BW-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 ; AVX512BW-NEXT: vmovdqa %xmm2, (%r8) ; AVX512BW-NEXT: retq @@ -134,7 +134,7 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] ; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8) ; AVX512BW-FCP-NEXT: retq @@ -145,7 +145,7 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-BW-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r8) ; AVX512DQ-BW-NEXT: retq @@ -156,7 +156,7 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: retq @@ -252,7 +252,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8) @@ -285,7 +285,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r8) @@ -318,7 +318,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r8) @@ -334,7 +334,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper @@ -349,7 +349,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -364,7 +364,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: 
vmovdqa %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -379,7 +379,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -452,7 +452,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -475,7 +475,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -498,7 +498,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -521,7 +521,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -544,7 +544,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; 
AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -567,7 +567,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -590,7 +590,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -610,7 +610,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512BW-NEXT: vzeroupper @@ -623,7 +623,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -636,7 +636,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -649,7 +649,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 
= [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -885,7 +885,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,1,1,2,2,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,1,1,2,2,3,3] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[3],ymm4[4],ymm9[5],ymm4[6],ymm9[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -932,7 +932,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm4 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -962,7 +962,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = 
xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -992,7 +992,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1022,7 +1022,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1042,9 +1042,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1057,9 +1057,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1072,9 +1072,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1087,9 +1087,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1518,7 +1518,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,1,1,2,2,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,1,1,2,2,3,3] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4],ymm12[5],ymm2[6],ymm12[7] ; AVX2-FCP-NEXT: vmovdqa 48(%rcx), %xmm12 @@ -1720,7 +1720,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vpermd %zmm11, %zmm13, %zmm1 {%k1} @@ -1899,7 +1899,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm13, %zmm1 {%k1} @@ -1955,26 +1955,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, 
%zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1990,26 +1990,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512BW-FCP-NEXT: 
vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2025,26 +2025,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2060,26 +2060,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2949,7 +2949,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill @@ -3340,7 +3340,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vpermd %zmm8, %zmm5, %zmm18 {%k1} @@ -3685,7 +3685,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: 
vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm5, %zmm18 {%k1} @@ -3819,32 +3819,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = 
[0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -3880,32 +3880,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -3941,32 +3941,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: 
vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -4002,32 +4002,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 
ca3cd2a11b666..c1e7f1e8c6c72 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -71,7 +71,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,65535,65535,65535,0,65535,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovd %xmm1, 16(%r9) @@ -195,7 +195,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovd %xmm1, 16(%r9) @@ -211,7 +211,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovd 
%xmm1, 16(%r9) @@ -227,7 +227,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovd %xmm1, 16(%r9) @@ -243,7 +243,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, 16(%r9) @@ -360,7 +360,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] @@ -387,7 +387,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] @@ -413,7 +413,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] @@ -541,7 +541,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, 32(%r9) @@ -560,7 +560,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm1, 32(%r9) @@ -579,7 +579,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm1, 32(%r9) @@ -598,7 +598,7 
@@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 32(%r9) @@ -784,10 +784,10 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero @@ -795,7 +795,7 @@ define void 
@store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] ; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] @@ -828,10 +828,10 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,6,7,10,11,u,u,u,u,u,u,8,9,u,u,u,u,22,23,26,27,u,u,u,u,u,u,24,25] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6],ymm9[7],ymm8[8,9],ymm9[10,11],ymm8[12,13,14],ymm9[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero @@ -839,7 +839,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] ; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FP-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] @@ -863,15 +863,15 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,5,2,6,2,6,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,2,6,0,0] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[2,3,6,7,2,3],zero,zero,zero,zero,ymm7[8,9,12,13,16,17],zero,zero,zero,zero,ymm7[18,19,22,23,28,29],zero,zero,zero,zero -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3],zero,zero,zero,zero,zero,zero,ymm8[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25,28,29] ; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero @@ -879,7 +879,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] ; AVX2-FCP-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] @@ -951,13 +951,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,0,0] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 @@ -1035,13 +1035,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] ; 
AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 @@ -1070,9 +1070,9 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1087,9 +1087,9 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 
64(%r9) @@ -1104,9 +1104,9 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1121,9 +1121,9 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1483,10 +1483,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = 
xmm6[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,2,5,5,5,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] @@ -1497,10 +1497,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-NEXT: 
vpbroadcastq 8(%rdi), %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1510,10 +1510,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] @@ -1525,10 +1525,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = 
[0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1536,10 +1536,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-NEXT: vmovdqa %ymm8, 128(%r9) @@ -1566,10 +1566,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 ; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 ; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1579,10 +1579,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] @@ 
-1592,10 +1592,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] @@ -1605,10 +1605,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; 
AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1616,10 +1616,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%r9) @@ -1646,10 +1646,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FCP-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 ; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 ; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1659,10 +1659,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 
= ymm3[1,1,1,2,5,5,5,6] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] @@ -1672,10 +1672,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] @@ -1685,10 +1685,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] 
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1696,10 +1696,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%r9) @@ -2022,19 +2022,19 @@ define void 
@store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512BW-NEXT: 
vmovdqa %ymm0, 128(%r9) ; AVX512BW-NEXT: vzeroupper @@ -2047,19 +2047,19 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512BW-FCP-NEXT: vzeroupper @@ -2072,19 +2072,19 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; 
AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2097,19 +2097,19 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -2847,10 +2847,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq (%r8), %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -2874,10 +2874,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} 
xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] @@ -2904,7 +2904,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 @@ -2924,7 +2924,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] ; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = 
[0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 ; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 @@ -2940,7 +2940,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpshufb %ymm0, %ymm13, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] @@ -2954,7 +2954,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 ; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm1 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 @@ -2966,7 +2966,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] ; AVX2-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm4 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] @@ -2977,7 +2977,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,2,2] @@ -3025,10 +3025,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm12 @@ -3050,13 +3050,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm9 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0 @@ -3082,7 +3082,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = 
ymm11[2,2,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 @@ -3097,7 +3097,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 ; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 @@ -3113,7 +3113,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] @@ -3125,7 +3125,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; 
AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 @@ -3137,7 +3137,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] @@ -3148,7 +3148,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; 
AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] @@ -3195,10 +3195,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm12 @@ -3220,13 +3220,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm9 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 @@ -3252,7 +3252,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 @@ -3267,7 +3267,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 ; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 @@ -3283,7 +3283,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} 
ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] @@ -3295,7 +3295,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 @@ -3307,7 +3307,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 ; 
AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] @@ -3318,7 +3318,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] @@ -3550,7 +3550,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm1 ; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 @@ -3607,7 +3607,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vprolq $16, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} 
ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 @@ -3616,7 +3616,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 ; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 @@ -3629,7 +3629,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,2,3,3,7,6,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 @@ -3639,7 +3639,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; 
AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm19, %zmm5 ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 @@ -3864,7 +3864,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 @@ -3921,7 +3921,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 @@ -3930,7 +3930,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 ; 
AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 @@ -3943,7 +3943,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,2,3,3,7,6,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 @@ -3953,7 +3953,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm19, %zmm5 ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 @@ -3975,46 +3975,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -4031,46 +4031,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-FCP-NEXT: vpermi2w 
%zmm4, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = 
[44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -4087,46 +4087,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -4143,46 +4143,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -5655,7 +5655,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = 
[65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm8 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 @@ -5686,7 +5686,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpbroadcastq (%r8), %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm5, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 @@ -5708,7 +5708,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm5 @@ -5752,7 +5752,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] @@ -5779,7 +5779,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10 ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5835,7 +5835,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm9, %ymm9 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm9 @@ -5860,7 +5860,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] @@ -5904,7 +5904,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] ; AVX2-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 ; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload ; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm9 ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 @@ -5925,7 +5925,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = mem[1,1,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, 
%ymm1 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm9 @@ -5956,7 +5956,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[1,1,2,2] @@ -6031,7 +6031,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] @@ -6060,7 +6060,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 32(%r8), %ymm10 @@ -6082,7 +6082,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 ; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm2 @@ -6126,7 +6126,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] @@ -6154,7 +6154,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6206,7 +6206,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm13 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 48(%r8), %ymm13 @@ -6232,7 +6232,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb 
%ymm14, %ymm10, %ymm13, %ymm10 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm13 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -6270,7 +6270,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm12 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm10 @@ -6293,7 +6293,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] @@ -6324,7 +6324,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] @@ -6399,7 +6399,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] @@ -6428,7 +6428,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 32(%r8), %ymm10 @@ -6450,7 +6450,7 @@ define void 
@store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm2 @@ -6494,7 +6494,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] @@ -6522,7 +6522,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = 
[65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6574,7 +6574,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm13 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 48(%r8), %ymm13 @@ -6600,7 +6600,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm13 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -6638,7 +6638,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm12 -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm10 @@ -6661,7 +6661,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] @@ -6692,7 +6692,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; 
AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] @@ -7170,7 +7170,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] @@ -7223,7 +7223,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,3,2,3,10,11,10,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,3,2,3,10,11,10,10] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,2,3,3,7,6,7,7] @@ -7231,7 +7231,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = 
[2,3,2,3,10,10,11,10] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 ; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 @@ -7291,7 +7291,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,3,2,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,2,3,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -7302,7 +7302,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10],ymm12[11],ymm8[12,13],ymm12[14],ymm8[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm8 @@ -7797,7 +7797,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm6 = [0,1,0,1,8,9,8,8] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] @@ -7850,7 +7850,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,3,2,3,10,11,10,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,3,2,3,10,11,10,10] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,2,3,3,7,6,7,7] @@ -7858,7 +7858,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 ; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 @@ -7918,7 +7918,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,3,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,2,3,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -7929,7 +7929,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10],ymm12[11],ymm8[12,13],ymm12[14],ymm8[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm8 @@ -8008,56 +8008,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = 
[0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = 
[44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 @@ -8104,56 +8104,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = 
[0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 @@ -8200,56 +8200,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; 
AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 @@ -8296,56 +8296,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = 
[0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = 
[0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 70b9c79f393f8..824bd6e023c79 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -228,7 +228,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rax) @@ -247,7 +247,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rax) @@ -266,7 +266,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rax) @@ -285,7 +285,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rax) @@ -531,7 +531,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] @@ -598,7 +598,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; 
AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] @@ -625,7 +625,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) @@ -646,7 +646,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) @@ -667,7 +667,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) @@ -688,7 +688,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) @@ -1151,9 +1151,9 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1170,9 +1170,9 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; 
AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1189,9 +1189,9 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1208,9 +1208,9 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1589,7 +1589,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero @@ -1624,7 +1624,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -1650,7 +1650,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] @@ -1705,7 +1705,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: 
vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm9 @@ -1740,7 +1740,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] @@ -1771,7 +1771,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 @@ -1831,7 +1831,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] @@ -1853,8 +1853,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] -; AVX2-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,0,0,2,0,0,3,0] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] @@ -1864,11 +1863,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,0,7,6,5,0,7,6] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,0,0,6,0,0,7,0] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -1891,11 +1889,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,4,0,6,5,4,0,6] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,0,5,0,0,6] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -1935,7 +1932,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,5,12,u,4,13,u,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7] ; AVX512-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] @@ -1944,10 +1941,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [5,u,14,6,u,15,7,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0] ; AVX512-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX512-NEXT: 
vpermi2d %zmm9, %zmm7, %zmm8 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 @@ -1957,7 +1954,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,1,9,u,2,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10] ; AVX512-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 ; AVX512-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-NEXT: vmovdqa (%r8), %xmm11 @@ -1972,7 +1969,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] @@ -1982,17 +1979,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,u,5,13,u,6,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14] ; AVX512-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,10,2,u,11,3,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0] ; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) @@ -2016,12 +2013,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,8,u,1,9,u,2,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] ; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm11 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm12, 
%xmm8, %xmm13 @@ -2030,10 +2027,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] @@ -2041,17 +2038,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,u,10,2,u,11,3,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] ; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,12,u,5,13,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 @@ -2063,19 +2060,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,5,12,u,4,13,u,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] ; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,u,14,6,u,15,7,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 @@ -2102,7 +2099,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,5,12,u,4,13,u,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7] ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = 
ymm3[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] @@ -2111,10 +2108,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [5,u,14,6,u,15,7,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0] ; AVX512DQ-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 @@ -2124,7 +2121,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,1,9,u,2,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10] ; AVX512DQ-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 @@ -2139,7 +2136,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] ; 
AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512DQ-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] @@ -2149,17 +2146,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,u,5,13,u,6,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14] ; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,10,2,u,11,3,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0] ; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-NEXT: 
vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) @@ -2183,12 +2180,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,8,u,1,9,u,2,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm11 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 @@ -2197,10 +2194,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] @@ -2208,17 +2205,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,u,10,2,u,11,3,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,12,u,5,13,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, 
%ymm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 @@ -2230,19 +2227,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,5,12,u,4,13,u,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,u,14,6,u,15,7,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 @@ -2261,17 +2258,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = 
[0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2288,17 +2285,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2315,17 +2312,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2342,17 +2339,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -3093,7 +3090,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -3197,7 +3194,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 ; AVX2-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3269,7 +3266,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm12 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, 
%ymm12 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -3378,7 +3375,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 @@ -3478,7 +3475,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -3533,7 +3530,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FP-NEXT: 
vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -3547,7 +3544,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq %xmm0, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3656,7 +3653,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; 
AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm1 @@ -3746,7 +3743,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,1,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,2,1,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] @@ -3758,7 +3755,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3781,7 +3778,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[5,6,5,6,5,6,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -3828,7 +3825,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3853,7 +3850,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] @@ -3988,7 +3985,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rdx), %xmm14 ; 
AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] ; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqa (%rsi), %xmm11 ; AVX512-NEXT: vmovdqa (%rdi), %xmm15 @@ -4043,7 +4040,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] @@ -4141,12 +4138,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm12 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} 
ymm3 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero @@ -4156,7 +4153,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23] ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm3 @@ -4164,7 +4161,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm4 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm17, %ymm1 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm4[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 @@ -4174,7 +4171,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # ymm15 = 
mem[0,1,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,2,u,3,10,u,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,2,0,3,10,0,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9 @@ -4222,7 +4219,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm12 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm12 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [1,0,2,2,1,0,2,2] @@ -4237,13 +4234,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm12 {%k2} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2d %ymm8, %ymm22, %ymm12 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = 
zmm12[0,1,2,3],zmm1[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 @@ -4253,7 +4250,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,0,0,u,8,8,u,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,8,8,0,9] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm12 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm4 @@ -4280,23 +4277,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[8],ymm9[8],ymm2[9],ymm9[9],ymm2[10],ymm9[10],ymm2[11],ymm9[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] ; AVX512-FCP-NEXT: vpermd %zmm18, %zmm7, %zmm18 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11] ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm21, %ymm18 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,u,0,1,u,10,10,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,0,10,10,0] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm9 @@ -4350,7 +4347,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 @@ -4496,7 +4493,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm10[2,1,2,3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 @@ -4589,7 +4586,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [1,1,1,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 @@ -4604,12 +4601,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm19 ; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 @@ -4626,10 +4623,10 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm6, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,u,0,1,u,10,10,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,0,10,10,0] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4662,9 +4659,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [5,6,5,6,5,6,7,7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm21, %ymm1 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero @@ 
-4672,14 +4669,14 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm22, %ymm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm9[0,1,2,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 @@ -4693,7 +4690,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,u,3,10,u,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,0,3,10,0,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 @@ -4719,7 +4716,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm4 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] @@ -4734,7 +4731,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm8, %zmm6, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm6, %zmm9 @@ -4746,7 +4743,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm8, %ymm5, %ymm3 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm9[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -4762,7 +4759,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %xmm31, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm4 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,0,0,u,8,8,u,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,8,8,0,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm11 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] @@ -4813,9 +4810,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4826,9 +4823,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4837,9 +4834,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4848,9 +4845,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = 
[0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4859,9 +4856,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4870,9 +4867,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -4901,9 +4898,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4914,9 +4911,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4925,9 +4922,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4936,9 +4933,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4947,9 +4944,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = 
[29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -4958,9 +4955,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -4989,9 +4986,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, 
%zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5002,9 +4999,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5013,9 +5010,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} 
zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5024,9 +5021,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5035,9 +5032,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5046,9 +5043,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -5077,9 +5074,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = 
[0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5090,9 +5087,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5101,9 +5098,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5112,9 +5109,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; 
AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5123,9 +5120,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -5134,9 +5131,9 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -6625,7 +6622,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -6854,7 +6851,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] @@ -7000,7 +6997,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm2 = mem[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm2[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7212,7 +7209,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -7403,7 +7400,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 @@ -7574,7 +7571,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7785,7 +7782,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 @@ -7998,7 +7995,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload @@ -8055,7 +8052,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -8123,7 +8120,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,1,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,1,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -8136,7 +8133,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8197,7 +8194,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] @@ -8478,7 +8475,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm14[0,1,2,3],zmm7[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] ; AVX512-NEXT: 
vmovdqa (%rcx), %xmm7 ; AVX512-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] @@ -8535,7 +8532,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512-NEXT: vpermt2d %zmm3, %zmm31, %zmm5 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] @@ -8836,10 +8833,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm8 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = 
ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm26, %ymm8 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero @@ -8850,7 +8847,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8858,12 +8855,12 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm4 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm8 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm29, %ymm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,u,3,10,u,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [2,2,0,3,10,0,10,11] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] @@ -8913,7 +8910,7 @@ define void 
@store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm4 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2] @@ -8928,18 +8925,18 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm30 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 ; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm16, %ymm4 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,0,0,u,8,8,u,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,8,8,0,9] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] @@ -9099,24 +9096,24 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm12 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11] ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm14, %ymm2 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,u,0,1,u,10,10,u] 
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,0,10,10,0] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload @@ -9230,7 +9227,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm4 ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill @@ -9535,7 +9532,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm13[0,0,2,1] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] @@ -9777,7 +9774,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 @@ -9789,25 +9786,25 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm22 ; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,9,2,3,8,5,6,11] ; 
AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm19, %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,u,0,1,u,10,10,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,0,10,10,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] @@ -9904,9 +9901,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,1,2,3,11,11,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] @@ -9916,14 +9913,14 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm30, %ymm8 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9934,7 +9931,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,2,u,3,10,u,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,0,3,10,0,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm2 @@ -9967,7 +9964,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} 
xmm3 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,2,1,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,0,2,2,1,0,2,2] ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] @@ -9982,13 +9979,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm6 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 ; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm16, %ymm3 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9997,7 +9994,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm8 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,0,0,u,8,8,u,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, 
%xmm3 @@ -10259,22 +10256,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 ; 
AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 @@ -10285,21 +10282,21 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = 
[0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 @@ -10401,22 +10398,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 @@ -10427,21 +10424,21 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = 
[0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 ; 
AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 @@ -10543,22 +10540,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 @@ -10569,21 +10566,21 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, 
%zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 @@ -10685,22 +10682,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = 
[58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 @@ -10711,21 +10708,21 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), 
%zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 8d945bdd7a9aa..79cc8e49f1fdb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -264,7 +264,7 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -285,7 +285,7 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-FCP-NEXT: 
vpextrd $2, %xmm0, 24(%rax) @@ -306,7 +306,7 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -327,7 +327,7 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -509,7 +509,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: 
vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero @@ -517,9 +517,9 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,1] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -553,16 +553,16 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -588,7 +588,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,7,1,3,7,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] @@ -604,7 +604,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 @@ -669,7 +669,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] @@ -684,7 +684,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] ; AVX512-FCP-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,7,1,3,7,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; 
AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 @@ -752,7 +752,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] @@ -767,7 +767,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] ; AVX512DQ-FCP-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,7,1,3,7,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 @@ -795,7 +795,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -819,10 +819,10 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -847,7 +847,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -871,10 +871,10 @@ 
define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -1163,14 +1163,14 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = 
[0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero @@ -1180,9 +1180,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] ; AVX2-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1191,9 +1191,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] ; AVX2-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] @@ -1231,14 +1231,14 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero @@ -1248,9 +1248,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] ; AVX2-FP-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1259,9 +1259,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpbroadcastd 
(%r10), %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] @@ -1295,16 +1295,16 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,24,25],zero,zero,zero,zero -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,5,u,u,5,2,6,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX2-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, 
%ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero @@ -1314,9 +1314,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] ; AVX2-FCP-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1325,9 +1325,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] @@ -1451,7 +1451,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,5,u,u,5,2,6,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -1576,7 +1576,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[1,5,u,u,5,2,6,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -1608,9 +1608,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1630,9 +1630,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1652,9 +1652,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1674,9 +1674,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, 
%zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -2295,34 +2295,34 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] ; AVX2-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; 
AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] ; AVX2-NEXT: vpermd %ymm3, %ymm8, %ymm8 ; AVX2-NEXT: vmovdqa %ymm3, %ymm4 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] ; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rcx), %xmm10 @@ -2336,7 +2336,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; 
AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa (%r9), %xmm9 ; AVX2-NEXT: vmovdqa (%r8), %xmm14 @@ -2344,9 +2344,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] @@ -2358,16 +2358,16 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm14 -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm7, %ymm8 @@ -2383,7 +2383,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 ; AVX2-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] @@ -2392,12 +2392,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = 
[0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] @@ -2406,15 +2406,15 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; 
AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -2427,7 +2427,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 ; AVX2-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] @@ -2439,9 +2439,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = 
ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -2453,7 +2453,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] @@ -2462,9 +2462,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload @@ -2494,30 +2494,30 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] ; AVX2-FP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm8, %ymm8 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm10 @@ -2530,7 +2530,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm9 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm14 @@ -2538,9 +2538,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = 
[65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] @@ -2551,15 +2551,15 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm14 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; 
AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] @@ -2573,7 +2573,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] @@ -2581,12 +2581,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] @@ -2594,14 +2594,14 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -2611,7 +2611,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] @@ -2621,9 +2621,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] @@ -2633,7 +2633,7 @@ define void 
@store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] @@ -2641,9 +2641,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload @@ -2679,17 +2679,17 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[1,2,2,3,5,6,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] @@ -2700,42 +2700,42 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] ; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[0,0,2,1,4,4,6,5] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; 
AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10 @@ -2748,7 +2748,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm8 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 @@ -2756,9 +2756,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; 
AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] @@ -2768,15 +2768,15 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] @@ -2786,17 +2786,17 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] @@ -2804,14 +2804,14 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2876,8 +2876,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,1,17,17,2,0,0,16,0,1,17,17,2,0] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[0,16,0,0,17,17,0,0,0,0,0,1,0,0,2,0] ; AVX512-NEXT: vpermi2d %zmm12, %zmm11, %zmm18 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] @@ -2901,7 +2900,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] ; AVX512-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 ; AVX512-NEXT: vprold $16, %ymm3, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] @@ -2932,8 +2931,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd %zmm13, %zmm3, %zmm3 ; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 @@ -2993,7 +2991,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,u,3,2,u,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -3009,15 +3007,14 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 ; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,8,1,9,0,8,1,9] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] @@ -3038,7 +3035,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] ; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] @@ -3052,7 +3049,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,u,u,u,7,u,u,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] @@ -3141,8 +3138,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,1,17,17,2,0,0,16,0,1,17,17,2,0] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,0,0,17,17,0,0,0,0,0,1,0,0,2,0] ; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm11, %zmm18 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] @@ -3166,7 +3162,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 ; AVX512DQ-NEXT: vprold $16, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] @@ -3197,8 +3193,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd %zmm13, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 @@ -3258,7 +3253,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,u,3,2,u,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11] ; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -3274,15 +3269,14 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] @@ -3303,7 +3297,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] @@ -3317,7 +3311,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,u,u,u,7,u,u,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] @@ -3369,27 +3363,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ 
-3415,27 +3409,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = 
[0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3461,27 +3455,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w 
%zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -3507,27 +3501,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = 
[0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -4806,13 +4800,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm9 ; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,3,u,u,u,4,u,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,0,4,0,0] ; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm4 ; AVX2-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-NEXT: 
vpermd %ymm13, %ymm0, %ymm0 @@ -4821,22 +4815,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpermd %ymm12, %ymm2, %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,3,u,u,u,4,u] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,0,4] ; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm5 ; AVX2-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa (%r8), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4848,16 +4842,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [3,u,u,3,u,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,3,0,0,0,4] ; AVX2-NEXT: vpermd %ymm6, %ymm4, %ymm5 ; AVX2-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqa (%rax), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 @@ -4875,22 +4869,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -4903,18 +4897,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw 
{{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0 @@ -4930,7 +4924,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 ; AVX2-NEXT: vmovdqa 
(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4954,7 +4948,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: vpbroadcastd 32(%rax), %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0 ; AVX2-NEXT: vmovdqa (%r9), %xmm5 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4966,7 +4960,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-NEXT: vpbroadcastd (%rax), %ymm14 ; AVX2-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill @@ -4981,7 +4975,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9 ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 
= xmm7[1,1,2,2] @@ -4998,14 +4992,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5016,7 +5010,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] @@ -5033,7 +5027,7 @@ define void 
@store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -5042,7 +5036,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -5059,7 +5053,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = 
ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] @@ -5071,9 +5065,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] @@ -5084,7 +5078,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] @@ -5093,9 +5087,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] ; 
AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] @@ -5111,7 +5105,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -5134,7 +5128,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = 
[0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] @@ -5144,7 +5138,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5188,13 +5182,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm10 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm9, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,3,u,u,u,4,u,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,3,0,0,0,4,0,0] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm5 ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} 
ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] @@ -5202,21 +5196,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm10, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm11 ; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm15, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,0,4] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm3, %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm11, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5227,16 +5221,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: 
vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,0,3,0,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm5 ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 @@ -5253,21 +5247,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] @@ -5278,18 +5272,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1 @@ -5307,7 +5301,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm14 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm13 @@ -5329,7 +5323,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5339,7 +5333,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 @@ -5356,7 +5350,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] @@ -5371,13 +5365,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5388,7 +5382,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; 
AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 @@ -5404,7 +5398,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] @@ -5412,7 +5406,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -5427,7 +5421,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] @@ -5438,9 +5432,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] @@ -5450,7 +5444,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; 
AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7] @@ -5458,9 +5452,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] @@ -5476,7 +5470,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm5 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] @@ 
-5497,7 +5491,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] @@ -5506,7 +5500,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5558,9 +5552,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5569,19 +5563,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5598,9 +5592,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 @@ -5617,7 +5611,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb 
%ymm0, %ymm1, %ymm2, %ymm7 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] @@ -5636,9 +5630,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -5648,51 +5642,51 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1 ; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,3,u,u,u,4,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,0,4,0,0] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,3,u,u,u,4,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,4] ; 
AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [3,u,u,3,u,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,0,0,3,0,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 @@ -5705,19 +5699,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] @@ -5728,18 +5722,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: 
vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 @@ -5759,7 +5753,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 @@ -5781,7 +5775,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5791,7 +5785,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm11 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, 
%ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0 @@ -5810,7 +5804,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] @@ -5825,13 +5819,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -5841,7 +5835,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 @@ -5857,7 +5851,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] @@ -5865,7 +5859,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; 
AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5935,7 +5929,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5943,7 +5937,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -5972,7 +5966,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[u,16,u,u,17,17,u,u,0,u,u,1,2,u,u,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%r9), %xmm1 @@ -5980,7 +5974,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vprold $16, %ymm10, %ymm4 @@ -5990,7 +5984,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8,9,10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] @@ -6077,8 +6071,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512-NEXT: vbroadcasti64x4 
{{.*#+}} zmm3 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] ; AVX512-NEXT: vmovdqa 32(%rax), %ymm6 ; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3 ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm7 @@ -6214,8 +6207,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd (%rax), %zmm2, %zmm2 ; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 @@ -6291,7 +6283,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm30 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,2,2,3,10,u,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] @@ -6300,7 +6292,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} 
xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,2,2,3,8,u,9,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7] @@ -6310,12 +6302,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3 @@ -6331,14 +6323,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,0,u,1,8,8,9,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 ; 
AVX512-FCP-NEXT: vmovdqa (%r8), %xmm9 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm27 ; AVX512-FCP-NEXT: vprold $16, %ymm13, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7] @@ -6347,7 +6339,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm28 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm14 @@ -6357,7 +6349,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,0,1,1,12,13,u,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,12,13,0,15] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm25 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm2 @@ -6379,11 +6371,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm6[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,u,3,10,10,11,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,0,3,10,10,11,11] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm8 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm21, %ymm2 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 @@ -6396,7 +6388,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm13 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[3,3,3,3,7,7,7,7] @@ -6404,7 +6396,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm13 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm13 = [2,1,3,2,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm6 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] @@ -6412,7 +6404,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6],xmm10[7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -6421,7 +6413,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512-FCP-NEXT: vprold $16, %xmm14, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3] @@ -6451,7 +6443,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm6 @@ -6482,7 +6474,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm14 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [6,u,u,u,7,u,u,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [6,0,0,0,7,0,0,7] ; AVX512-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpermd %zmm3, %zmm21, %zmm3 @@ -6555,7 +6547,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6563,7 +6555,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] 
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -6592,7 +6584,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,16,u,u,17,17,u,u,0,u,u,1,2,u,u,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 @@ -6600,7 +6592,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vprold $16, %ymm10, %ymm4 @@ -6610,7 +6602,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8,9,10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] @@ -6697,8 +6689,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8,9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] ; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm6 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm7 @@ -6834,8 +6825,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd (%rax), %zmm2, %zmm2 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 @@ -6911,7 +6901,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm30 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm8[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,2,2,3,10,u,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] @@ -6920,7 +6910,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,2,2,3,8,u,9,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7] @@ -6930,12 +6920,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,8,8,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3 @@ -6951,14 +6941,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,0,u,1,8,8,9,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm9 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm13, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[1,2,2,3,5,6,6,7] @@ -6967,7 +6957,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm14 @@ -6977,7 +6967,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,0,1,1,12,13,u,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,12,13,0,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm25 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm2 @@ -6999,11 +6989,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,u,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,0,3,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm21, %ymm2 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 @@ -7016,7 +7006,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm13 ; 
AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[3,3,3,3,7,7,7,7] @@ -7024,7 +7014,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm6 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] @@ -7032,7 +7022,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6],xmm10[7] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,1,1,8,8,10,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -7041,7 +7031,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm14, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3] @@ -7071,7 +7061,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm6 @@ -7102,7 +7092,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm14 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [6,u,u,u,7,u,u,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [6,0,0,0,7,0,0,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vpermd 
%zmm3, %zmm21, %zmm3 @@ -7163,7 +7153,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -7177,7 +7167,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -7194,7 +7184,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = 
[0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k3 @@ -7211,7 +7201,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k3 @@ -7226,7 +7216,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -7241,7 +7231,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -7258,7 +7248,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -7299,7 +7289,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -7313,7 +7303,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -7330,7 +7320,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 @@ -7347,7 +7337,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: movl $-2029118408, 
%ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 @@ -7362,7 +7352,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7377,7 +7367,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512BW-FCP-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7394,7 +7384,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7435,7 +7425,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -7449,7 +7439,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -7466,7 +7456,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 @@ -7483,7 +7473,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 @@ -7498,7 +7488,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: movl $946921923, %ecx 
# imm = 0x3870E1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -7513,7 +7503,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -7530,7 +7520,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -7571,7 +7561,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = 
[0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -7585,7 +7575,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -7602,7 +7592,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 @@ -7619,7 +7609,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 @@ -7634,7 +7624,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7649,7 +7639,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = 
[0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7666,7 +7656,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -10287,14 +10277,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rax), %ymm6 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,3,u,u,u,4,u] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,0,4] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [3,u,u,3,u,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm10 = [3,0,0,3,0,0,0,4] ; AVX2-NEXT: vpermd %ymm3, %ymm11, 
%ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] @@ -10304,7 +10294,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpermd %ymm9, %ymm10, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-NEXT: vpermd %ymm7, %ymm10, %ymm0 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1 @@ -10316,10 +10306,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [3,u,u,u,4,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,0,4,0,0,4] ; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-NEXT: vmovdqa (%rcx), %ymm3 @@ -10328,15 +10318,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] ; AVX2-NEXT: 
vpermd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa %ymm4, %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm2 @@ -10395,26 +10385,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 96(%r8), %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 96(%r9), %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-NEXT: vmovdqa %ymm2, %ymm8 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rax), %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -10429,22 +10419,22 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} 
ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -10457,18 +10447,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 @@ -10488,7 +10478,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-NEXT: 
vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10550,7 +10540,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10583,7 +10573,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-NEXT: vpbroadcastd 96(%rax), %ymm14 ; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload @@ -10606,7 +10596,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1] ; 
AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -10653,7 +10643,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6] @@ -10671,7 +10661,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 100(%rax), %ymm8 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload @@ -10690,7 +10680,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} 
xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10729,7 +10719,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -10753,7 +10743,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 @@ -10762,34 +10752,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,3,u,u,u,4,u,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,0,4,0,0] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,3,u,u,u,4,u] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,3,0,4] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: vmovdqa {{.*#+}} 
ymm3 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,3,u,u,u,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,3,0,0,0,4] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -10806,7 +10796,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = 
[65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -10851,7 +10841,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] @@ -10875,7 +10865,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload @@ -10897,7 +10887,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} 
ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10926,7 +10916,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] @@ -10937,7 +10927,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] @@ -10958,7 +10948,7 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 @@ -10977,7 +10967,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] @@ -11016,7 +11006,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7] ; 
AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -11037,7 +11027,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8 ; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 @@ -11109,34 +11099,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm5 ; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,u,u,u,4,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,0,0,0,4,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm14, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,3,u,u,u,4,u,u] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,0,0,0,4,0,0] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm10, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} 
ymm13 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,3,0,4] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%r9), %ymm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,3,0,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm0, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = 
[0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 @@ -11188,10 +11178,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,3,0,4] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm12, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm2, %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11210,9 +11200,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,3,0,0,0,4] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11222,7 +11212,7 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm8, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm1 @@ -11240,20 +11230,20 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -11264,19 +11254,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; 
AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] @@ -11287,18 +11277,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 
= [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 @@ -11318,7 +11308,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm3 @@ -11377,7 +11367,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -11406,7 +11396,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd 96(%rax), %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm15, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11428,7 +11418,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] @@ -11470,7 +11460,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm9, %xmm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] @@ -11484,7 +11474,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 100(%rax), %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4 @@ -11504,7 +11494,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] @@ -11541,7 +11531,7 @@ define void 
@store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -11565,7 +11555,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 @@ -11588,7 +11578,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11629,7 +11619,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 @@ -11651,7 +11641,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload @@ -11665,7 +11655,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} 
ymm0 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [151522058,0,421010202,421010202] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm8 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -11673,7 +11663,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -11708,7 +11698,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm12 @@ -11727,7 +11717,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: 
vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 @@ -11739,14 +11729,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218894094,0,488382238,488382238] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm11 @@ -11783,7 +11773,7 @@ 
define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm15 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11801,7 +11791,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm0, %ymm0 @@ -11874,33 +11864,33 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,u,u,u,4,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,0,0,0,4,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,3,u,u,u,4,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,0,4,0,0] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,3,u,u,u,4,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,4] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,3,u,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,3,0,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm3 @@ -11913,11 +11903,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11963,10 +11953,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 @@ -12016,11 +12006,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 
# 32-byte Reload @@ -12047,7 +12037,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [151522058,0,421010202,421010202] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm0 @@ -12056,9 +12046,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -12073,7 +12063,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12099,9 +12089,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 @@ -12124,7 +12114,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [218894094,0,488382238,488382238] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -12132,9 +12122,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload @@ -12166,11 +12156,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,u,4,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,0,4,0,0,4] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 @@ -12179,12 +12169,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,3,u,u,u,4,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,0,0,0,4,0,0] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm6 @@ -12193,16 +12183,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = 
[0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,3,u,u,u,4,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,3,0,4] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7 @@ -12213,15 +12203,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,u,u,3,u,u,u,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,0,0,3,0,0,0,4] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm8, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm0 @@ -12240,22 +12230,22 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -12266,19 +12256,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] @@ -12289,18 +12279,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw 
{{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 @@ -12322,7 +12312,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} 
ymm1 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12382,7 +12372,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12411,7 +12401,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd 96(%rax), %ymm14 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -12432,7 +12422,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm0 @@ -12477,7 +12467,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] @@ -12491,7 +12481,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 100(%rax), %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11 @@ -12507,7 +12497,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq 
{{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload @@ -12544,7 +12534,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload @@ -12568,7 +12558,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 104(%rax), %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm5, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm3, %ymm3 @@ -12719,7 +12709,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = 
ymm14[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11] ; AVX512-NEXT: vpermi2q %zmm10, %zmm12, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] @@ -12781,8 +12771,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] ; AVX512-NEXT: vmovdqa 96(%rax), %ymm7 ; AVX512-NEXT: vpermd %zmm7, %zmm18, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12880,10 +12869,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10] ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd 64(%rax), %zmm11, %zmm0 ; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm26, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12901,7 +12889,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] ; AVX512-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512-NEXT: vmovdqa 96(%r8), %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -12941,7 +12929,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] ; AVX512-NEXT: vpermt2d %zmm2, %zmm29, %zmm1 ; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm3 @@ -12958,7 +12946,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] ; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 64(%rsi), %xmm4 @@ -12982,7 +12970,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm1 ; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5 @@ -13440,7 +13428,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $248, %ymm12, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,4,u,u,u,5,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,0,0,0,5,0,0] ; AVX512-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpternlogq $184, %ymm0, %ymm17, %ymm3 @@ -13453,7 +13441,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] ; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm0 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpandnq %ymm1, %ymm25, %ymm1 @@ -13467,14 +13455,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151522058,0,421010202,421010202] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm29 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm27 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7] @@ -13485,18 +13473,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm16 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7,8,9],ymm9[10],ymm7[11,12],ymm9[13],ymm7[14,15] -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,u,u,u,6,u,u,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm12 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31] ; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] @@ -13515,7 +13502,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 ; AVX512-FCP-NEXT: vpternlogq $248, %zmm23, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13539,7 +13526,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,u,3,10,10,11,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,0,3,10,10,11,11] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm21, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13575,11 +13562,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vprold $16, %ymm14, %ymm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm19 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15] ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm19, %zmm2 @@ -13613,11 +13599,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpternlogd $184, %zmm3, %zmm25, %zmm0 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 @@ -13633,7 +13618,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7] @@ -13645,7 +13630,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm1 @@ -13663,7 +13648,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd 
{{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-FCP-NEXT: vmovdqa %xmm7, %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 @@ -13676,7 +13661,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm5, %zmm1 @@ -13687,7 +13672,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13740,7 +13725,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; 
AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -13762,9 +13747,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm18 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm11 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm7 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] @@ -13772,8 +13757,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm5, 
%zmm19 ; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm25, %zmm19 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 @@ -13786,7 +13770,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7] ; AVX512-FCP-NEXT: vpermd %ymm30, %ymm31, %ymm1 ; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 @@ -13807,10 +13791,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -13832,7 +13816,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9] +; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm28 = [0,1,1,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] @@ -13853,7 +13837,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9] ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm10 ; AVX512-FCP-NEXT: vpermd %ymm14, %ymm31, %ymm9 ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm18 @@ -13917,7 +13901,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm8 = mem[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero @@ -14066,7 +14050,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,1,3,2,10,10,10,11] ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm12, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] @@ -14128,8 +14112,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] ; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm7 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm18, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14227,10 +14210,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,9,11,10] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd 64(%rax), %zmm11, %zmm0 ; AVX512DQ-NEXT: 
vpternlogd $184, %zmm1, %zmm26, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14248,7 +14230,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] ; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -14288,7 +14270,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm29, %zmm1 ; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm3 @@ -14305,7 +14287,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm4 @@ -14329,7 +14311,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm1 ; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5 @@ -14787,7 +14769,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm12, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,4,u,u,u,5,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,0,0,0,5,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm0, %ymm17, %ymm3 @@ -14800,7 +14782,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,1,4,5,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpandnq %ymm1, %ymm25, %ymm1 @@ -14814,14 +14796,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151522058,0,421010202,421010202] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm29 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm27 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7] @@ -14832,18 +14814,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm16 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7,8,9],ymm9[10],ymm7[11,12],ymm9[13],ymm7[14,15] 
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,u,u,u,6,u,u,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] @@ -14862,7 +14843,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,1,1,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm23, %zmm3, 
%zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14886,7 +14867,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,u,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,0,3,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14922,11 +14903,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vprold $16, %ymm14, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,1,3,2,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm19, %zmm2 @@ -14960,11 +14940,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5] ; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm3, %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 @@ -14980,7 +14959,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,2,2,3,8,9,9,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[3,3,3,3,7,7,7,7] @@ -14992,7 +14971,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm1 
@@ -15010,7 +14989,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,1,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 @@ -15023,7 +15002,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,1,8,8,10,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm5, %zmm1 @@ -15034,7 +15013,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,9,9,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -15087,7 +15066,7 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -15109,9 +15088,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm18 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,9,11,10] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,2,3,8,8,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm7 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] @@ -15119,8 +15098,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 
= [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm19 ; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm25, %zmm19 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 @@ -15133,7 +15111,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm31 = [6,7,3,3,7,7,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm30, %ymm31, %ymm1 ; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 @@ -15154,10 +15132,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,2,1,8,8,9,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -15179,7 +15157,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm13 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,1,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] @@ -15200,7 +15178,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm10 ; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm31, %ymm9 ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm18 @@ -15264,7 +15242,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm8 = mem[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero @@ -15349,7 +15327,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] @@ -15363,7 +15341,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] @@ -15395,7 +15373,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} @@ -15419,7 +15397,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 ; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512BW-NEXT: kmovd %eax, %k3 @@ -15439,7 +15417,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 ; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -15447,7 +15425,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 ; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -15468,7 +15446,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 ; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -15488,7 +15466,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -15510,7 +15488,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: 
vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} @@ -15518,15 +15496,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax) @@ -15575,7 +15553,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] @@ -15589,7 +15567,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] @@ -15621,7 +15599,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 ; AVX512BW-FCP-NEXT: 
vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} @@ -15645,7 +15623,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 ; AVX512BW-FCP-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 @@ -15665,7 +15643,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 ; AVX512BW-FCP-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -15673,7 +15651,7 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 ; AVX512BW-FCP-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -15694,7 +15672,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -15714,7 +15692,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: 
kmovd %eax, %k1 @@ -15736,7 +15714,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} @@ -15744,15 +15722,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-FCP-NEXT: 
vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) @@ -15801,7 +15779,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] @@ -15815,7 +15793,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] @@ -15847,7 
+15825,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} @@ -15871,7 +15849,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 ; AVX512DQ-BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 @@ -15891,7 +15869,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512DQ-BW-NEXT: 
vpermi2w %zmm20, %zmm3, %zmm24 ; AVX512DQ-BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -15899,7 +15877,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 ; AVX512DQ-BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -15920,7 +15898,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -15940,7 +15918,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -15962,7 +15940,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} @@ -15970,15 +15948,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rax) @@ -16027,7 +16005,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] @@ -16041,7 +16019,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = 
[0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] @@ -16073,7 +16051,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} @@ -16097,7 +16075,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 ; AVX512DQ-BW-FCP-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 @@ -16117,7 +16095,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 ; AVX512DQ-BW-FCP-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -16125,7 +16103,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -16146,7 +16124,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl 
$946921923, %eax # imm = 0x3870E1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -16166,7 +16144,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -16188,7 +16166,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} @@ -16196,15 +16174,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 15fc795b13122..194b715b6594a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -240,7 +240,7 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -261,7 +261,7 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -282,7 +282,7 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -303,7 +303,7 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = 
[0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -789,7 +789,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -815,7 +815,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -841,7 +841,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -1031,25 +1031,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0] ; AVX2-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-NEXT: vpblendd {{.*#+}} 
ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542] ; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0] ; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1095,25 +1095,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0] ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1159,25 +1159,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0] ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0] ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1223,25 +1223,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512-NEXT: vpshufb %ymm5, %ymm4, 
%ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] @@ -1287,25 +1287,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, 
%ymm9, %ymm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] @@ -1351,25 +1351,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; 
AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] @@ -1415,25 +1415,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] @@ -1477,9 +1477,9 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1501,9 +1501,9 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1525,9 +1525,9 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1549,9 +1549,9 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -2231,32 +2231,32 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm4 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm12 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm14 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm11, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] @@ -2279,57 +2279,57 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%r10), %ymm13 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm15 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -2416,16 +2416,16 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] ; AVX512-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = 
[0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] ; AVX512-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512-NEXT: movb $-86, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] ; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] ; AVX512-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 @@ -2500,16 +2500,16 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512-FCP-NEXT: movb $-86, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = 
[4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 @@ -2584,16 +2584,16 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512DQ-NEXT: movb $-86, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 @@ -2668,16 +2668,16 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,16,0,0,1,17,10,10,10,26,0,0,11,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,1,0,1,17,0,0,10,26,11,11,11,27,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: movb $-86, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 @@ -2706,26 +2706,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-86, %cl ; 
AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: 
vmovdqa64 %zmm2, 192(%rax) @@ -2748,26 +2748,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-86, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2790,26 +2790,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-86, %cl ; 
AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2832,26 +2832,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -4174,7 +4174,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4188,7 +4188,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm3 ; 
AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4199,17 +4199,17 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] @@ -4234,11 +4234,11 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm15, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -4246,16 +4246,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm11, 
%ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -4302,10 +4302,10 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 @@ -4313,23 +4313,23 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm12 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} 
ymm0 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm2, %ymm11 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] @@ -4337,23 +4337,23 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 @@ -4380,29 +4380,29 @@ define void 
@store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; 
AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] @@ -4410,7 +4410,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] @@ -4460,9 +4460,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] ; AVX512-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] ; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512-NEXT: kmovw %r11d, %k1 ; AVX512-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} @@ -4472,9 +4472,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] ; AVX512-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] ; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512-NEXT: kmovw %r11d, %k2 ; AVX512-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} @@ -4484,18 +4484,18 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512-NEXT: vpermd %zmm6, %zmm20, %zmm14 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] ; AVX512-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] @@ -4704,9 +4704,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; AVX512-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm31, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] ; AVX512-FCP-NEXT: vpermd %zmm23, %zmm0, %zmm24 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm31, %zmm23 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm23 {%k1} @@ -4735,9 +4735,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; 
AVX512-FCP-NEXT: vpermd %zmm11, %zmm0, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] ; AVX512-FCP-NEXT: vpermd %zmm21, %zmm13, %zmm11 {%k2} ; AVX512-FCP-NEXT: vpermd %zmm27, %zmm0, %zmm21 ; AVX512-FCP-NEXT: vpermd %zmm26, %zmm13, %zmm21 {%k2} @@ -4745,21 +4745,21 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm13, %zmm3 {%k2} ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm13, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 {%k1} # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] ; AVX512-FCP-NEXT: vpermd %zmm12, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vpermd %zmm28, %zmm4, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm4, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm20, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm13, %zmm13 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm13 {%k2} ; AVX512-FCP-NEXT: vpermd %zmm15, %zmm4, %zmm15 @@ -4802,9 +4802,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] ; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512DQ-NEXT: kmovw %r11d, %k1 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} @@ -4814,9 +4814,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] ; 
AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512DQ-NEXT: kmovw %r11d, %k2 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} @@ -4826,18 +4826,18 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512DQ-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm14 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] @@ -5046,9 +5046,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; AVX512DQ-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm24, %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm0, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm31, %zmm23 ; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm23 {%k1} @@ -5077,9 +5077,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm0, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm13, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm13, %zmm21 {%k2} @@ -5087,21 +5087,21 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm13, 
%zmm3 {%k2} ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm13, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 {%k1} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] ; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm4, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm4, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm13, %zmm13 ; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm13 {%k2} ; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm4, %zmm15 @@ -5144,16 +5144,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-NEXT: 
vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -5161,80 +5161,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-86, %cl ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: 
vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw 
{{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -5262,16 +5262,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; 
AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -5279,80 +5279,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $-86, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -5380,16 +5380,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -5397,80 +5397,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-86, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} 
zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = 
[20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512DQ-BW-NEXT: 
vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -5498,16 +5498,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -5515,80 +5515,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $-86, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw 
{{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = 
[0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] 
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -8126,51 +8126,51 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm5 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm12 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm13 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm12, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm10 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqa 32(%r10), %xmm6 @@ -8181,7 +8181,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] @@ -8199,17 +8199,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm14, %ymm2 ; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm13 @@ -8218,14 +8218,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -8259,12 +8259,12 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm13, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm13, %ymm5 ; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm14 @@ -8273,16 +8273,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = 
[0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -8315,11 +8315,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,2,2,2,u,u,3,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,2,2,0,0,3,3] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,3,3,3,3,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] @@ -8327,16 +8327,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,u,u,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,u,0,u,u,u,1,u] +; AVX2-FCP-NEXT: vpmovsxbq 
{{.*#+}} ymm3 = [0,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,1,1,1,1,u,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,u,1,u,1,u,u,u] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,1,0] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -8354,10 +8354,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%r10), %ymm10 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm11 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 @@ -8365,45 +8365,45 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm1 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 @@ -8427,35 +8427,35 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] @@ -8484,33 +8484,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: 
vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] 
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm13, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] @@ -8520,7 +8520,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -8532,7 +8532,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11 @@ -8540,43 +8540,43 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm15 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd 
%ymm10, %ymm13, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm9 ; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] @@ -8662,9 +8662,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%r8), %xmm7 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] ; AVX512-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] ; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512-NEXT: kmovw %r11d, %k2 ; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} @@ -8677,9 +8677,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 96(%r9), %ymm8 ; AVX512-NEXT: vmovdqa 96(%r8), %ymm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; 
AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 96(%rcx), %ymm10 @@ -8687,10 +8687,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm13 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512-NEXT: vpermd %zmm1, %zmm16, %zmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] @@ -8827,9 +8827,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermd %zmm6, %zmm30, %zmm8 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; AVX512-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] ; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] ; AVX512-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -9145,9 +9145,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -9168,9 +9168,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm19 {%k2} ; AVX512-FCP-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload @@ -9186,10 +9186,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload @@ -9205,15 +9205,15 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm12, %zmm0, %zmm23 ; AVX512-FCP-NEXT: vpermd %zmm14, %zmm0, %zmm23 {%k2} ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: 
vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 @@ -9312,9 +9312,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] ; AVX512DQ-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] ; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512DQ-NEXT: kmovw %r11d, %k2 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} @@ -9327,9 +9327,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm8 ; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
[0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512DQ-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm10 @@ -9337,10 +9337,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm13 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] @@ -9477,9 +9477,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermd %zmm6, %zmm30, %zmm8 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; 
AVX512DQ-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] ; AVX512DQ-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -9795,9 +9795,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -9818,9 +9818,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm19 {%k2} ; AVX512DQ-FCP-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, 
%zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload @@ -9836,10 +9836,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload @@ -9855,15 +9855,15 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm0, %zmm23 ; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm0, %zmm23 {%k2} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 @@ -9957,57 +9957,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -10029,10 +10029,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -10053,33 +10053,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] 
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -10098,28 +10098,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -10251,57 +10251,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -10323,10 +10323,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -10347,33 +10347,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 
64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -10392,28 +10392,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -10545,57 +10545,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = 
[0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -10617,10 +10617,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -10641,33 +10641,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512DQ-BW-NEXT: 
vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -10686,28 +10686,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
[24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -10839,57 +10839,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: 
vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; 
AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -10911,10 +10911,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -10935,33 +10935,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 
= [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -10980,28 +10980,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, 
%zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = 
[24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index e22a98cbbb56d..e333e47219116 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -560,9 +560,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -573,9 +573,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm2, (%rdx) @@ -586,9 +586,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -599,9 +599,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -612,9 +612,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -625,9 +625,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -638,9 +638,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -651,9 +651,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), 
%zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -900,10 +900,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -920,10 +920,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -940,10 +940,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -960,10 +960,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -980,10 +980,10 @@ define void 
@store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -1000,10 +1000,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -1020,10 +1020,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: 
vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -1040,10 +1040,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -1545,10 +1545,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} 
zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1579,10 +1579,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1613,10 +1613,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1647,10 +1647,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FCP-NEXT: 
vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1681,10 +1681,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1715,10 +1715,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; 
AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1749,10 +1749,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1783,10 +1783,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d 
%zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 028a27cd88be5..de2e1df4c5566 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -586,9 +586,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -600,9 +600,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -614,9 +614,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa 
(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -628,9 +628,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -642,9 +642,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: 
vmovdqa64 %zmm3, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -656,9 +656,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -670,9 +670,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -684,9 +684,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1001,17 +1001,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1024,17 +1024,17 @@ define void 
@store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1047,17 +1047,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1070,17 +1070,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1093,17 +1093,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1116,17 +1116,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, 
%zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1139,17 +1139,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1162,17 +1162,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1821,20 +1821,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-NEXT: 
vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1859,20 +1859,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1897,20 +1897,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1935,20 +1935,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1973,20 +1973,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, 
%zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -2011,20 +2011,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -2049,20 +2049,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -2087,20 +2087,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -3493,20 +3493,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3559,20 +3559,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm11, 
%zmm20, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3625,20 +3625,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3691,20 +3691,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3757,20 +3757,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; 
AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3823,20 +3823,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = 
[0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3889,20 +3889,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: 
vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3955,20 +3955,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index 2581010d42744..4beed72f22e33 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -665,9 +665,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm2, (%r8) @@ -680,9 +680,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -695,9 +695,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 
%zmm3, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r8) @@ -710,9 +710,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -725,9 +725,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -740,9 +740,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -755,9 +755,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -770,9 +770,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1201,26 +1201,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1236,26 +1236,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1271,26 +1271,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1306,26 +1306,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1341,26 +1341,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; 
AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1376,26 +1376,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-86, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1411,26 +1411,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-86, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1446,26 +1446,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2373,32 +2373,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 ; 
AVX512-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2434,32 +2434,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2495,32 +2495,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2556,32 +2556,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2617,32 +2617,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; 
AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2678,32 +2678,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movb $-86, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, 
%zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2739,32 +2739,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movb $-86, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = 
[12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2800,32 +2800,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -4733,32 +4733,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4846,32 +4846,32 @@ define void 
@store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 
%zmm20, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4959,32 +4959,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5072,32 +5072,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5185,32 +5185,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5298,32 +5298,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: movb $-86, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm22, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5411,32 +5411,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: movb $-86, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = 
[0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5524,32 +5524,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = 
[0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 834adf1aab5ab..58991d65cf1ee 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -73,7 +73,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,u,1,3,5] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,0,1,3,5] ; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovd %eax, %xmm3 ; AVX2-NEXT: vpbroadcastd %xmm3, %ymm3 @@ -96,7 +96,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovq %rax, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,u,1,3,5] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,0,1,3,5] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovd %eax, %xmm3 ; AVX2-FP-NEXT: vpbroadcastd %xmm3, %ymm3 @@ -119,7 +119,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovq %rax, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,2,4,6,u,1,3,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,0,1,3,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovd %eax, %xmm3 ; AVX2-FCP-NEXT: vpbroadcastd %xmm3, %ymm3 @@ -474,9 +474,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-NEXT: vmovdqa %xmm1, 64(%r9) @@ -491,9 +491,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -508,9 +508,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = 
[7,11,15,19] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-NEXT: vmovdqa %xmm1, 64(%r9) @@ -525,9 +525,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -542,9 +542,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -559,9 +559,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; 
AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -576,9 +576,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -593,9 +593,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -946,15 +946,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) @@ -971,15 +971,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -996,15 +996,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512DQ-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1021,15 +1021,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1046,15 +1046,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; 
AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1071,15 +1071,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1096,15 +1096,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermi2d 
%zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1121,15 +1121,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1849,7 +1849,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1860,7 +1860,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1871,7 +1871,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1880,7 +1880,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -1889,7 +1889,7 @@ define void 
@store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1915,7 +1915,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1926,7 +1926,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1937,7 +1937,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1946,7 +1946,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -1955,7 +1955,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1981,7 +1981,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 
= [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1992,7 +1992,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2003,7 +2003,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2012,7 +2012,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2021,7 +2021,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -2047,7 +2047,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -2058,7 +2058,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2069,7 +2069,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2078,7 +2078,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2087,7 +2087,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -2113,7 +2113,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -2124,7 +2124,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2135,7 +2135,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2144,7 +2144,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2153,7 +2153,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -2179,7 +2179,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -2190,7 +2190,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2201,7 +2201,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2210,7 +2210,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2219,7 +2219,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -2245,7 +2245,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -2256,7 +2256,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2267,7 +2267,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2276,7 +2276,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2285,7 +2285,7 @@ define 
void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -2311,7 +2311,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -2322,7 +2322,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2333,7 +2333,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2342,7 +2342,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2351,7 +2351,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -3919,7 +3919,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 
{{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -3932,7 +3932,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -3943,7 +3943,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -3954,7 +3954,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = 
[0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -3967,7 +3967,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-NEXT: kmovw %eax, %k3 ; AVX512-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4025,7 +4025,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4038,7 +4038,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4049,7 +4049,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; 
AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4060,7 +4060,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4073,7 +4073,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-FCP-NEXT: kmovw %eax, %k3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4131,7 +4131,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4144,7 +4144,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4155,7 +4155,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4166,7 +4166,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = 
[0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4179,7 +4179,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4237,7 +4237,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4250,7 +4250,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4261,7 +4261,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4272,7 +4272,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4285,7 +4285,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4343,7 +4343,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4356,7 +4356,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4367,7 +4367,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4378,7 +4378,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, 
%zmm24, %zmm15 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4391,7 +4391,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4449,7 +4449,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4462,7 +4462,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4473,7 +4473,7 @@ define void 
@store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4484,7 +4484,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4497,7 +4497,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4555,7 +4555,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-NEXT: kmovd 
%eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4568,7 +4568,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4579,7 +4579,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4590,7 +4590,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4603,7 +4603,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -4661,7 +4661,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] @@ -4674,7 +4674,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] @@ -4685,7 +4685,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] @@ -4696,7 +4696,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] @@ -4709,7 +4709,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d 
%zmm13, %zmm27, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 @@ -8155,16 +8155,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8369,16 +8369,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8583,16 +8583,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8797,16 +8797,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9011,16 +9011,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9225,16 +9225,16 @@ define void 
@store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9439,16 +9439,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9653,16 +9653,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 402b802cd035b..0cd72be39557a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -536,9 +536,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rax) @@ -555,9 +555,9 @@ define void 
@store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -574,9 +574,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax) @@ -593,9 +593,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -612,9 +612,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -631,9 +631,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -650,9 +650,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -669,9 +669,9 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1064,12 +1064,10 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [6,0,0,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,6,0,7,0,6,0,7] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,6,0,0,0,0,0,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm10, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] @@ -1098,17 +1096,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: 
vmovdqa64 %zmm0, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1125,17 +1123,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1152,17 +1150,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; 
AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1179,17 +1177,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1206,17 +1204,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1233,17 +1231,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1260,17 +1258,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1287,17 +1285,17 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2124,11 +2122,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2154,7 +2152,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm7 @@ -2205,24 +2203,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512-NEXT: 
vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 ; AVX512-NEXT: movb $36, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2233,9 +2231,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-110, %cl ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; 
AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2244,9 +2242,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2254,18 +2252,18 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-NEXT: vpermi2d %zmm0, %zmm11, 
%zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax) @@ -2294,11 +2292,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $-110, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2306,9 +2304,9 @@ define void 
@store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $36, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2317,9 +2315,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2328,19 +2326,19 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2349,9 +2347,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -2375,24 +2373,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512DQ-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 ; AVX512DQ-NEXT: movb $36, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2403,9 +2401,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-110, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2414,9 +2412,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2424,18 +2422,18 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-NEXT: vpunpckhdq 
{{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax) @@ -2464,11 +2462,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $-110, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, 
%zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2476,9 +2474,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $36, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2487,9 +2485,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2498,19 +2496,19 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; 
AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2519,9 +2517,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -2545,24 +2543,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512BW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 ; AVX512BW-NEXT: movb $36, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2573,9 +2571,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-110, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2584,9 +2582,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 
%zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2594,18 +2592,18 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-NEXT: vpermi2d 
%zmm1, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) @@ -2634,11 +2632,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $-110, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2646,9 +2644,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $36, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2657,9 +2655,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2668,19 +2666,19 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2689,9 +2687,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -2715,24 +2713,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; 
AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512DQ-BW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 ; AVX512DQ-BW-NEXT: movb $36, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2743,9 +2741,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-BW-NEXT: movb $-110, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2754,9 +2752,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2764,18 +2762,18 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax) @@ -2804,11 +2802,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $-110, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2816,9 +2814,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $36, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2827,9 +2825,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = 
[0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2838,19 +2836,19 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2859,9 +2857,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -4623,10 +4621,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -4798,27 +4796,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512-NEXT: movb $36, %dl ; AVX512-NEXT: kmovw %edx, %k1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 ; AVX512-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ 
-4827,18 +4825,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-110, %cl ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -4860,9 +4858,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] @@ -4874,9 +4872,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -4923,7 +4921,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -4944,7 +4942,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4974,22 +4972,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5000,21 +4998,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, 
%zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -5073,27 +5071,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512DQ-NEXT: movb $36, %dl ; AVX512DQ-NEXT: kmovw %edx, %k1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512DQ-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5102,18 +5100,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-110, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 
%zmm18, %zmm12 {%k1} @@ -5135,9 +5133,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] @@ -5149,9 +5147,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -5198,7 +5196,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -5219,7 +5217,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5249,22 +5247,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 
%zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5275,21 +5273,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm4, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -5348,27 +5346,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512BW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512BW-NEXT: movb $36, %dl ; AVX512BW-NEXT: kmovd %edx, %k1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
[0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512BW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5377,18 +5375,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-110, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, 
%zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -5410,9 +5408,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] @@ -5424,9 +5422,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = 
zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -5473,7 +5471,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -5494,7 +5492,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; 
AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5524,22 +5522,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm12, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5550,21 +5548,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; 
AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -5623,27 +5621,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512DQ-BW-NEXT: movb $36, %dl ; AVX512DQ-BW-NEXT: kmovd %edx, %k1 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5652,18 +5650,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-110, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -5685,9 +5683,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 
64(%rdi), %ymm22 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] @@ -5699,9 +5697,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -5748,7 +5746,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -5769,7 +5767,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5799,22 +5797,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 
{%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5825,21 +5823,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -9497,10 +9495,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm15, 
%ymm5, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9854,7 +9852,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9898,7 +9896,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] ; AVX512-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512-NEXT: movb $36, %al ; AVX512-NEXT: kmovw %eax, %k1 @@ -9919,7 +9917,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm3 @@ -9975,7 +9973,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 @@ -9983,7 +9981,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 @@ -9991,7 +9989,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 @@ -10008,14 +10006,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, 
%zmm19 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 @@ -10028,33 +10026,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 
= [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 @@ -10111,7 +10109,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 @@ -10192,7 +10190,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -10246,22 +10244,22 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -10319,28 +10317,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 @@ -10423,7 +10421,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10467,7 +10465,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] ; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512DQ-NEXT: movb $36, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 @@ -10488,7 +10486,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 
= [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm3 @@ -10544,7 +10542,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 @@ -10552,7 +10550,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 @@ -10560,7 +10558,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512DQ-NEXT: 
vpermt2d %zmm2, %zmm1, %zmm27 @@ -10577,14 +10575,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 @@ -10597,33 +10595,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 @@ -10680,7 +10678,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 @@ -10761,7 +10759,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -10815,22 +10813,22 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -10888,28 +10886,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 @@ -10992,7 +10990,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11036,7 +11034,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] ; AVX512BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512BW-NEXT: movb $36, %al ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -11057,7 +11055,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 @@ -11113,7 +11111,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 @@ -11121,7 +11119,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 @@ -11129,7 +11127,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 @@ -11146,14 +11144,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] ; 
AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 @@ -11166,33 +11164,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-NEXT: vpermt2d %zmm2, 
%zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 @@ -11249,7 +11247,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 @@ -11330,7 +11328,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -11384,22 +11382,22 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -11457,28 +11455,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; 
AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 @@ -11561,7 +11559,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11605,7 +11603,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] ; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512DQ-BW-NEXT: movb $36, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -11626,7 +11624,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm3 @@ -11682,7 +11680,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 @@ -11690,7 +11688,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 @@ -11698,7 +11696,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 @@ -11715,14 +11713,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), 
%ymm1 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 @@ -11735,33 +11733,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, 
%zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 @@ -11818,7 +11816,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 @@ -11899,7 +11897,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -11953,22 +11951,22 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -12026,28 +12024,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 9b598c608c210..837d990596a5a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -201,7 +201,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -225,10 +225,10 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -253,7 +253,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; 
AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -277,10 +277,10 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -305,7 +305,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -329,10 +329,10 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -357,7 +357,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -381,10 +381,10 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -683,9 +683,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, 
(%r10), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -705,9 +705,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -727,9 +727,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) @@ -749,9 +749,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -771,9 +771,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -793,9 +793,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: 
vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -815,9 +815,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -837,9 +837,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, 
%zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1422,29 +1422,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; 
AVX512-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-NEXT: kmovw %ecx, %k1 @@ -1468,29 +1466,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -1514,29 +1510,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -1560,29 +1554,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -1606,29 +1598,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; 
AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -1652,29 +1642,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: movw $3612, %cx # imm 
= 0xE1C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -1698,29 +1686,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 
= [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -1744,29 +1730,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] -; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3104,96 +3088,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} 
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-NEXT: kmovw %ecx, %k3 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-NEXT: kmovw %ecx, 
%k2 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = 
[0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512-NEXT: kmovw %ecx, %k1 @@ -3219,96 +3203,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = 
[2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512-FCP-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-FCP-NEXT: kmovw %ecx, %k3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512-FCP-NEXT: vpermi2d 
%zmm0, %zmm11, %zmm14 ; AVX512-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -3334,96 +3318,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-NEXT: vpermi2d 
%zmm5, %zmm3, %zmm9 ; AVX512DQ-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-NEXT: kmovw %ecx, %k3 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -3449,96 +3433,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; 
AVX512DQ-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: movw 
$12384, %cx # imm = 0x3060 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, 
%zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm11, 
%zmm14 ; AVX512DQ-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -3564,96 +3548,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-NEXT: vpermi2d 
%zmm5, %zmm3, %zmm9 ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512BW-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512BW-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -3679,96 +3663,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; 
AVX512BW-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: movw 
$12384, %cx # imm = 0x3060 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, 
%zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, 
%zmm14 ; AVX512BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3794,96 +3778,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -3909,96 +3893,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, 
%zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -6774,42 +6758,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = 
[0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -6820,7 +6804,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -6830,7 +6814,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-NEXT: vpermt2d %zmm26, 
%zmm25, %zmm22 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -6838,14 +6822,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512-NEXT: kmovw %eax, %k2 @@ -6853,56 +6837,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; 
AVX512-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512-NEXT: kmovw %eax, %k3 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; 
AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512-NEXT: kmovw %eax, %k2 @@ -6913,17 +6897,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; 
AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -6931,15 +6915,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -6977,42 +6961,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), 
%zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = 
[0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7023,7 +7007,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7033,7 +7017,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = 
[u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7041,14 +7025,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512-FCP-NEXT: kmovw %eax, %k2 @@ -7056,56 +7040,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512-FCP-NEXT: kmovw %eax, %k3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512-FCP-NEXT: kmovw %eax, %k2 @@ -7116,17 +7100,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7134,15 +7118,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7180,42 +7164,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7226,7 +7210,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7236,7 +7220,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7244,14 +7228,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-NEXT: kmovw %eax, %k2 @@ -7259,56 +7243,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-NEXT: 
vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-NEXT: kmovw %eax, %k2 @@ -7319,17 +7303,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7337,15 +7321,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7383,42 +7367,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = 
[0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d 
%zmm18, %zmm17, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7429,7 +7413,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7439,7 +7423,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7447,14 +7431,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-FCP-NEXT: 
kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 @@ -7462,56 +7446,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 @@ -7522,17 +7506,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 
= [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7540,15 +7524,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7586,42 +7570,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = 
[0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7632,7 +7616,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7642,7 +7626,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7650,14 +7634,14 @@ define void 
@store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -7665,56 +7649,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512BW-NEXT: movw $-30962, %ax # imm = 
0x870E ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512BW-NEXT: vpermt2d 
%zmm13, %zmm0, %zmm31 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -7725,17 +7709,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7743,15 +7727,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7789,42 +7773,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 
(%r9), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7835,7 +7819,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7845,7 +7829,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7853,14 +7837,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512BW-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512BW-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -7868,56 +7852,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512BW-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512BW-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] 
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512BW-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512BW-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -7928,17 
+7912,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7946,15 +7930,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7992,42 +7976,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -8038,7 +8022,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -8048,7 +8032,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -8056,14 +8040,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} 
zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -8071,56 +8055,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-BW-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d 
%zmm12, %zmm21, %zmm24 ; AVX512DQ-BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -8131,17 +8115,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 
= [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -8149,15 +8133,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -8195,42 +8179,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -8241,7 +8225,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -8251,7 +8235,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -8259,14 +8243,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-BW-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -8274,56 +8258,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, 
%zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-BW-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -8334,17 +8318,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -8352,15 +8336,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, 
%zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -13970,47 +13954,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -14050,23 +14034,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; 
AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -14126,29 +14110,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, 
(%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14208,7 +14192,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -14235,7 +14219,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -14250,7 +14234,7 @@ define void 
@store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-NEXT: movw $-30962, %cx # imm = 0x870E @@ -14277,7 +14261,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -14289,7 +14273,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 @@ -14315,12 +14299,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512-NEXT: movw $3612, %ax # imm = 0xE1C @@ -14341,23 +14325,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512-NEXT: 
vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -14376,25 +14360,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; 
AVX512-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -14446,47 +14430,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -14526,23 +14510,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -14602,29 +14586,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] 
; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14684,7 +14668,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -14711,7 +14695,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = 
[0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -14726,7 +14710,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -14753,7 +14737,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -14765,7 +14749,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d 
%zmm28, %zmm0, %zmm1 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -14791,12 +14775,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -14817,23 +14801,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -14852,25 +14836,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -14922,47 +14906,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15002,23 +14986,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: 
vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -15078,29 +15062,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -15160,7 +15144,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-NEXT: movw $28897, %cx # imm = 0x70E1 @@ 
-15187,7 +15171,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -15202,7 +15186,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-NEXT: movw $-30962, %cx # imm = 0x870E @@ -15229,7 +15213,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -15241,7 +15225,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: 
vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 @@ -15267,12 +15251,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-NEXT: movw $3612, %ax # imm = 0xE1C @@ -15293,23 +15277,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -15328,25 +15312,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 
%zmm12, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -15398,47 +15382,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 
(%r9), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15478,23 +15462,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -15554,29 +15538,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = 
[0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -15636,7 +15620,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -15663,7 +15647,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -15678,7 +15662,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -15705,7 
+15689,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -15717,7 +15701,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -15743,12 +15727,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -15769,23 +15753,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -15804,25 +15788,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -15874,47 +15858,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: 
vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15954,23 +15938,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, 
%zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -16030,29 +16014,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16112,7 +16096,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -16139,7 +16123,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -16154,7 +16138,7 @@ define void 
@store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E @@ -16181,7 +16165,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -16193,7 +16177,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 @@ -16219,12 +16203,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 
192(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C @@ -16245,23 +16229,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm1, 
%zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -16280,25 +16264,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -16350,47 +16334,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; 
AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -16430,23 +16414,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = 
[u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -16506,29 +16490,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = 
[u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16588,7 +16572,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: 
vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -16615,7 +16599,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -16630,7 +16614,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -16657,7 +16641,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -16669,7 
+16653,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -16695,12 +16679,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -16721,23 +16705,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -16756,25 +16740,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; 
AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-FCP-NEXT: 
vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -16826,47 +16810,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, 
%zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm25, %zmm0 @@ -16906,23 +16890,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -16982,29 +16966,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-BW-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -17064,7 +17048,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -17091,7 +17075,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -17106,7 +17090,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $-30962, %cx # imm = 0x870E @@ -17133,7 +17117,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -17145,7 +17129,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 @@ -17171,12 +17155,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: movw $3612, %ax # imm = 0xE1C @@ -17197,23 +17181,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -17232,25 +17216,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d 
%zmm2, %zmm13, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -17302,47 +17286,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -17382,23 +17366,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -17458,29 +17442,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -17540,7 +17524,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -17567,7 +17551,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -17582,7 +17566,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -17609,7 +17593,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -17621,7 +17605,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -17647,12 +17631,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = 
[0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -17673,23 +17657,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -17708,25 +17692,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = 
[29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 59f03a65a1fde..955927eb76912 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -659,9 +659,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -683,9 +683,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -707,9 +707,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) @@ -731,9 +731,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: 
vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -755,9 +755,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -779,9 +779,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -803,9 +803,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -827,9 +827,9 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) 
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1456,26 +1456,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-52, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1498,26 +1498,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-52, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, 
%zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1540,26 +1540,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-52, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1582,26 +1582,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-52, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1624,26 +1624,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-52, 
%cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1666,26 +1666,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-52, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1708,26 
+1708,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-52, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1750,26 +1750,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, 
%zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -3049,16 +3049,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512-NEXT: movb $-120, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, 
%zmm4 ; AVX512-NEXT: movb $34, %cl ; AVX512-NEXT: kmovw %ecx, %k2 @@ -3066,80 +3066,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-52, %cl ; AVX512-NEXT: kmovw %ecx, %k3 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512-NEXT: 
vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm13, 
%zmm16 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3167,16 +3167,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512-FCP-NEXT: movb $-120, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 
= [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: movb $34, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 @@ -3184,80 +3184,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $-52, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3285,16 +3285,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-NEXT: vpermi2d 
%zmm8, %zmm7, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-NEXT: movb $-120, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: movb $34, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 @@ -3302,80 +3302,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-52, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k3 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 
{%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3403,16 +3403,16 @@ define void @store_i32_stride8_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-120, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: movb $34, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 @@ -3420,80 +3420,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $-52, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = 
[0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3521,16 +3521,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movb $-120, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $34, %cl ; 
AVX512BW-NEXT: kmovd %ecx, %k2 @@ -3538,80 +3538,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-52, %cl ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = 
[8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = 
[12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3639,16 +3639,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movb $-120, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: movb $34, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -3656,80 +3656,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $-52, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} 
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3757,16 +3757,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movb $-120, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: movb $34, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -3774,80 +3774,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-52, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-BW-NEXT: 
vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3875,16 +3875,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-120, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $34, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -3892,80 +3892,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $-52, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = 
[0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -6392,57 +6392,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = 
[0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -6464,10 +6464,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -6488,33 +6488,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -6533,28 +6533,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: 
vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm19, 
%zmm3 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -6686,57 +6686,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -6758,10 +6758,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -6782,33 +6782,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; 
AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = 
[0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -6827,28 +6827,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -6980,57 +6980,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7052,10 +7052,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 ; 
AVX512DQ-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7076,33 +7076,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-NEXT: vpermt2d 
%zmm0, %zmm1, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7121,28 +7121,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7274,57 +7274,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, 
%zmm31, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7346,10 +7346,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7370,33 +7370,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7415,28 +7415,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = 
[12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7568,57 +7568,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; 
AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7640,10 +7640,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7664,33 +7664,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7709,28 +7709,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7862,57 +7862,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7934,10 +7934,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = 
[0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7958,33 +7958,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -8003,28 +8003,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -8156,57 +8156,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = 
[0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -8228,10 +8228,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -8252,33 +8252,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, 
%zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -8297,28 +8297,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -8450,57 +8450,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -8522,10 +8522,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -8546,33 +8546,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -8591,28 +8591,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -13565,41 +13565,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -13668,42 +13668,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm25 ; 
AVX512-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -13772,42 +13772,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 
%zmm1, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -13876,35 +13876,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14219,41 +14219,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, 
%zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -14322,42 +14322,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, 
%zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -14426,42 +14426,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -14530,35 +14530,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 
= [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14873,41 +14873,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-NEXT: 
vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -14976,42 +14976,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 
= [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -15080,42 +15080,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -15184,35 +15184,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} 
zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -15527,41 +15527,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -15630,42 +15630,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, 
%zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -15734,42 +15734,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -15838,35 +15838,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, 
%zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16181,41 +16181,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; 
AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -16284,42 +16284,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -16388,42 +16388,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -16492,35 +16492,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} 
zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16835,41 +16835,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; 
AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -16938,42 +16938,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -17042,42 +17042,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -17146,35 +17146,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 
64(%rsi), %zmm0 @@ -17489,41 +17489,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -17592,42 +17592,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-BW-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm26, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -17696,42 +17696,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} 
zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -17800,35 +17800,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} 
zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -18143,41 +18143,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -18246,42 +18246,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -18350,42 +18350,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -18454,35 +18454,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), 
%zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll index 67c6c3e820e33..67b447ed5d014 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -445,9 +445,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -458,9 +458,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -471,9 +471,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; 
AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -484,9 +484,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -497,9 +497,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -510,9 +510,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; 
AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -523,9 +523,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -536,9 +536,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -801,10 +801,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[4,12,5,13,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -821,10 +821,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -841,10 +841,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -861,10 +861,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: 
vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -881,10 +881,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -901,10 +901,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512BW-FCP-NEXT: vpermt2q 
%zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -921,10 +921,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -941,10 +941,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -1479,10 +1479,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} 
zmm8 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1513,10 +1513,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1547,10 +1547,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1581,10 +1581,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FCP-NEXT: 
vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1615,10 +1615,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1649,10 +1649,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1683,10 +1683,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1717,10 +1717,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -2905,10 +2905,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = 
[4,12,5,13,6,14,7,15] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -2967,10 +2967,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3029,10 +3029,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3091,10 +3091,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), 
%zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3153,10 +3153,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3215,10 +3215,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3277,10 +3277,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3339,10 +3339,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index 196eeca69dd8c..a01d4de0027f4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -317,9 +317,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -331,9 +331,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -345,9 +345,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rcx) @@ 
-359,9 +359,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -373,9 +373,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -387,9 +387,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa 
%ymm2, 64(%rcx) @@ -401,9 +401,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -415,9 +415,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -648,17 +648,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 
= [u,3,11,u,4,12,u,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -671,17 +671,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: 
vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -694,17 +694,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -717,17 +717,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -740,17 +740,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512BW-NEXT: vpermi2q 
%zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -763,17 +763,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -786,17 +786,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -809,17 +809,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1260,20 +1260,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = 
[10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1298,20 +1298,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1336,20 +1336,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,8,u,1,9,u,2,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1374,20 +1374,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = 
[5,0,14,6,0,15,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1412,20 +1412,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1450,20 +1450,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1488,20 +1488,20 @@ define void 
@store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1526,20 +1526,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -2505,20 +2505,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512-NEXT: 
vmovdqa64 %zmm7, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2571,20 +2571,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm7, 
%zmm18, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2637,20 +2637,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2703,20 +2703,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-FCP-NEXT: 
vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2769,20 +2769,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: 
vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2835,20 +2835,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2901,20 +2901,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2967,20 +2967,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -5032,19 +5032,19 @@ define void 
@store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5153,19 +5153,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; 
AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5274,19 +5274,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 
= [5,u,14,6,u,15,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5395,19 +5395,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5516,19 
+5516,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5637,19 +5637,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
[0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5758,19 +5758,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5879,19 +5879,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll index 82e40102a7e41..38623c6ce0cb0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -353,7 +353,7 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) @@ -392,7 +392,7 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) @@ -431,7 +431,7 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; 
AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) @@ -470,7 +470,7 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) @@ -746,26 +746,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-52, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -781,26 +781,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-52, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -816,26 +816,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-52, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512DQ-NEXT: vpermi2q 
%zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -851,26 +851,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-52, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, 
%zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -886,26 +886,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm7 = [4,12,u,u,5,13,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -921,26 +921,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-52, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 
= [4,12,u,u,5,13,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -956,26 +956,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-52, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -991,26 +991,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = 
[0,0,4,12,0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1600,32 +1600,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512-NEXT: movb $-52, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqa64 
%zmm12, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1661,32 +1661,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movb $-52, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = 
[0,8,u,u,1,9,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1722,32 +1722,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-NEXT: movb $-52, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1783,32 +1783,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: movb $-52, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1844,32 +1844,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm8 = [u,u,2,10,u,u,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1905,32 +1905,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movb $-52, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1966,32 +1966,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movb $-52, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; 
AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -2027,32 +2027,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -3322,32 +3322,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512-NEXT: movb $-52, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-NEXT: vpermt2q %zmm21, 
%zmm16, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3435,32 +3435,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512-FCP-NEXT: movb $-52, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3548,32 +3548,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-NEXT: movb $-52, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3661,32 +3661,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: movb $-52, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 
%zmm22 @@ -3774,32 +3774,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[4,12,u,u,5,13,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3887,32 +3887,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: movb $-52, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4000,32 +4000,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: movb $-52, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-NEXT: 
vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4113,32 +4113,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -6763,19 +6763,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = 
[0,0,2,10,0,0,3,11] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -6855,19 +6855,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm20 ; 
AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7058,19 +7058,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -7150,19 +7150,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm30 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7353,19 +7353,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -7445,19 +7445,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7648,19 +7648,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = 
[0,0,0,8,0,0,1,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -7740,19 +7740,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7943,19 +7943,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8035,19 
+8035,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -8238,19 +8238,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = 
[0,0,2,10,0,0,3,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8330,19 +8330,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -8533,19 +8533,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8625,19 +8625,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -8828,19 +8828,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8920,19 +8920,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
[6,14,u,u,7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index f6e27e54cca97..ffdbdea024ea0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -123,9 +123,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-NEXT: vmovdqa %xmm1, 64(%r9) @@ -140,9 +140,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -157,9 +157,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-NEXT: vmovdqa %xmm1, 64(%r9) @@ -174,9 +174,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -191,9 +191,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -208,9 +208,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -225,9 +225,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -242,9 +242,9 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 
(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -453,15 +453,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) @@ -477,15 +477,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm4 = [13,u,2,6,10,14,u,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -501,15 +501,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) @@ -525,15 +525,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, 
%zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -549,15 +549,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: 
vmovdqa64 %zmm5, (%r9) @@ -573,15 +573,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -597,15 +597,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = 
[0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) @@ -621,15 +621,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -1007,9 +1007,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: 
vmovdqa64 (%rcx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-NEXT: movb $49, %al ; AVX512-NEXT: kmovw %eax, %k1 @@ -1017,39 +1017,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movb $8, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-NEXT: movb $-116, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1066,9 +1066,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-FCP-NEXT: movb $49, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -1076,39 +1076,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $8, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[u,u,0,8,u,u,u,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-FCP-NEXT: movb $-116, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; 
AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1125,9 +1125,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-NEXT: movb $49, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 @@ -1135,39 +1135,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movb $8, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-NEXT: movb $-116, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermi2q 
%zmm0, %zmm7, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1184,9 +1184,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: movb $49, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -1194,39 +1194,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movb $8, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: movb $-116, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1243,9 +1243,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; 
AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -1253,39 +1253,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1302,9 +1302,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: movb $49, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -1312,39 +1312,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $8, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm7 = [0,8,0,0,0,1,9,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: movb $-116, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; 
AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1361,9 +1361,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: movb $49, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -1371,39 +1371,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movb $8, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: movb $-116, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -1420,9 +1420,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $49, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -1430,39 +1430,39 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movb $8, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $-116, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -2316,10 +2316,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = 
[3,0,0,0,12,4,0,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512-NEXT: movb $49, %al @@ -2332,47 +2332,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512-NEXT: movb $-116, %al ; AVX512-NEXT: kmovw %eax, %k3 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2415,10 +2415,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; 
AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512-FCP-NEXT: movb $49, %al @@ -2431,47 +2431,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512-FCP-NEXT: movb $-116, %al ; AVX512-FCP-NEXT: kmovw %eax, %k3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm10, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2514,10 +2514,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; 
AVX512DQ-NEXT: movb $49, %al @@ -2530,47 +2530,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-NEXT: movb $-116, %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
[u,5,13,u,u,u,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2613,10 +2613,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: movb $49, %al @@ -2629,47 +2629,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: movb $-116, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2712,10 +2712,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512BW-NEXT: movb $49, %al @@ -2728,47 +2728,47 @@ define void 
@store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = 
[0,5,13,0,0,0,6,14] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2811,10 +2811,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: movb $49, %al @@ -2827,47 +2827,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512BW-FCP-NEXT: movb $-116, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2910,10 +2910,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: movb $49, %al @@ -2926,47 +2926,47 @@ define void @store_i64_stride5_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-BW-NEXT: movb $-116, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -3009,10 +3009,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; 
AVX512DQ-BW-FCP-NEXT: movb $49, %al @@ -3025,47 +3025,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $-116, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -4966,11 +4966,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm11, 
%zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4981,17 +4981,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -4999,7 +4999,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; 
AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5018,7 +5018,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5028,7 +5028,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5089,15 +5089,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; 
AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5168,11 +5168,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5183,17 +5183,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm29 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -5201,7 +5201,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5220,7 +5220,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5230,7 +5230,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5291,15 +5291,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5370,11 +5370,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5385,17 +5385,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -5403,7 +5403,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5422,7 +5422,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5432,7 +5432,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5493,15 +5493,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: 
vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5572,11 +5572,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5587,17 +5587,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -5605,7 +5605,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5624,7 +5624,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5634,7 +5634,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5695,15 +5695,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5774,11 +5774,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512BW-NEXT: 
vmovdqa64 192(%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5789,17 +5789,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -5807,7 +5807,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5826,7 +5826,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5836,7 +5836,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5897,15 +5897,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5976,11 +5976,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5991,17 +5991,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] 
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -6009,7 +6009,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6028,7 +6028,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6038,7 +6038,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -6099,15 +6099,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6178,11 +6178,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6193,17 +6193,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[6,14,u,u,u,7,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -6211,7 +6211,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6230,7 +6230,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6240,7 +6240,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -6301,15 +6301,15 
@@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6380,11 +6380,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6395,17 +6395,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -6413,7 +6413,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6432,7 +6432,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6442,7 +6442,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -6503,15 +6503,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10535,7 +10535,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -10551,15 +10551,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: 
vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10654,20 +10654,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -10804,16 +10804,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10998,7 +10998,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -11014,15 +11014,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512-FCP-NEXT: vmovdqa64 
%zmm11, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11117,20 +11117,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -11267,16 +11267,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11461,7 +11461,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: 
vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -11477,15 +11477,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11580,20 +11580,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; 
AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -11730,16 +11730,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11924,7 +11924,7 @@ 
define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -11940,15 +11940,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12043,20 +12043,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 
(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -12193,16 +12193,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12387,7 +12387,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -12403,15 +12403,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12506,20 +12506,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -12656,16 +12656,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12850,7 +12850,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -12866,15 +12866,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12969,20 +12969,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 
%zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -13119,16 +13119,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13313,7 +13313,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm2 
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -13329,15 +13329,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13432,20 +13432,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -13582,16 +13582,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13776,7 +13776,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -13792,15 +13792,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = 
[0,5,13,0,0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13895,20 +13895,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -14045,16 +14045,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index b92614ee7b196..99bcebd28f120 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -139,9 +139,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rax) @@ -158,9 +158,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -177,9 +177,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax) @@ -196,9 +196,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -215,9 +215,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -234,9 +234,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -253,9 +253,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; 
AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -272,9 +272,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [5,7,9,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,7,9,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -536,18 +536,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] 
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -564,18 +564,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -592,18 +592,18 @@ define void 
@store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -620,18 +620,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -648,18 +648,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[10,14,2,3,4,5,11,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -676,18 +676,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -704,18 +704,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -732,18 +732,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] ; AVX512DQ-BW-FCP-NEXT: 
vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1236,8 +1236,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512-NEXT: movb $12, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 @@ -1255,9 +1254,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movb $48, %r9b ; AVX512-NEXT: kmovw %r9d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1266,9 +1265,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1278,9 +1277,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1290,16 +1289,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = 
[10,0,2,3,4,5,11,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1321,8 +1320,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512-FCP-NEXT: movb $12, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 @@ -1340,9 +1338,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $48, %r9b ; AVX512-FCP-NEXT: kmovw %r9d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1351,9 +1349,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1363,9 +1361,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1375,16 +1373,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = 
[10,0,2,3,4,5,11,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1406,8 +1404,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-NEXT: movb $12, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k1 @@ -1425,9 +1422,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movb $48, %r9b ; AVX512DQ-NEXT: kmovw %r9d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1436,9 +1433,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1448,9 +1445,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1460,16 +1457,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1491,8 +1488,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: movb $12, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 @@ -1510,9 +1506,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movb $48, %r9b ; AVX512DQ-FCP-NEXT: kmovw %r9d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1521,9 +1517,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1533,9 +1529,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1545,16 +1541,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1576,8 +1572,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 @@ -1595,9 +1590,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1606,9 +1601,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, 
%zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1618,9 +1613,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1630,16 +1625,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] ; 
AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1661,8 +1656,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: movb $12, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 @@ -1680,9 +1674,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $48, %r9b ; AVX512BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1691,9 +1685,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1703,9 +1697,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1715,16 +1709,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1746,8 +1740,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: movb $12, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 @@ -1765,9 +1758,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movb $48, %r9b ; AVX512DQ-BW-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1776,9 +1769,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1788,9 +1781,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1800,16 +1793,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = 
ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1831,8 +1824,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $12, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 @@ -1850,9 +1842,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movb $48, %r9b ; AVX512DQ-BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 
= [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1861,9 +1853,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -1873,9 +1865,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1885,16 +1877,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -2990,8 +2982,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512-NEXT: movb $12, %r10b @@ -3013,9 +3004,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3028,9 +3019,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $48, %r9b ; AVX512-NEXT: kmovw %r9d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3041,9 +3032,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3070,7 +3061,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpmovsxbq 
{{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3078,7 +3069,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3087,9 +3078,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -3129,8 +3120,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512-FCP-NEXT: movb $12, %r10b @@ 
-3152,9 +3142,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3167,9 +3157,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $48, %r9b ; AVX512-FCP-NEXT: kmovw %r9d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3180,9 +3170,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3209,7 +3199,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3217,7 +3207,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3226,9 +3216,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512-FCP-NEXT: 
vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -3268,8 +3258,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-NEXT: movb $12, %r10b @@ -3291,9 +3280,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3306,9 +3295,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $48, %r9b ; AVX512DQ-NEXT: kmovw %r9d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512DQ-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3319,9 +3308,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3348,7 +3337,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3356,7 +3345,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3365,9 +3354,9 @@ define void @store_i64_stride6_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -3407,8 +3396,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: movb $12, %r10b @@ -3430,9 +3418,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm22 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3445,9 +3433,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $48, %r9b ; AVX512DQ-FCP-NEXT: kmovw %r9d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3458,9 +3446,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3487,7 +3475,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3495,7 +3483,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3504,9 +3492,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -3546,8 +3534,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512BW-NEXT: 
movb $12, %r10b @@ -3569,9 +3556,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3584,9 +3571,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3597,9 +3584,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = 
[0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3626,7 +3613,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3634,7 +3621,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3643,9 +3630,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512BW-NEXT: 
vpermt2q %zmm4, %zmm15, %zmm2 @@ -3685,8 +3672,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movb $12, %r10b @@ -3708,9 +3694,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3723,9 +3709,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $48, %r9b ; AVX512BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; 
AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3736,9 +3722,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3765,7 +3751,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3773,7 +3759,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = 
[3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3782,9 +3768,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -3824,8 +3810,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movb $12, %r10b @@ -3847,9 +3832,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] 
; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3862,9 +3847,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $48, %r9b ; AVX512DQ-BW-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3875,9 +3860,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -3904,7 +3889,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -3912,7 +3897,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3921,9 +3906,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -3963,8 +3948,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movb $12, %r10b @@ -3986,9 +3970,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4001,9 +3985,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $48, %r9b ; AVX512DQ-BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4014,9 +3998,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, 
%zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 @@ -4043,7 +4027,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] @@ -4051,7 +4035,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4060,9 +4044,9 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 @@ -6459,8 +6443,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -6529,12 +6512,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -6543,23 +6526,23 @@ define void 
@store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -6583,7 +6566,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -6598,7 +6581,7 @@ 
define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $16, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -6617,9 +6600,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -6737,8 +6720,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -6807,12 +6789,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -6821,23 +6803,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -6861,7 +6843,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -6876,7 +6858,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $16, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -6895,9 +6877,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vpermt2q 
%zmm17, %zmm21, %zmm9 @@ -7015,8 +6997,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -7085,12 +7066,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -7099,23 +7080,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -7139,7 +7120,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -7154,7 +7135,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $16, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -7173,9 
+7154,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -7293,8 +7274,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -7363,12 +7343,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -7377,23 +7357,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -7417,7 +7397,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -7432,7 +7412,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $16, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -7451,9 +7431,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -7571,8 +7551,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512BW-NEXT: 
vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -7641,12 +7620,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -7655,23 +7634,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512BW-NEXT: 
vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -7695,7 +7674,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -7710,7 +7689,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $16, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -7729,9 +7708,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -7849,8 +7828,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -7919,12 +7897,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -7933,23 +7911,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -7973,7 +7951,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, 
%zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -7988,7 +7966,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $16, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -8007,9 +7985,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -8127,8 +8105,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 @@ 
-8197,12 +8174,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -8211,23 +8188,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = 
[14,0,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -8251,7 +8228,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -8266,7 +8243,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $16, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -8285,9 +8262,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq 
{{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -8405,8 +8382,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 @@ -8475,12 +8451,12 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 @@ -8489,23 +8465,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 @@ -8529,7 +8505,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 @@ -8544,7 +8520,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $16, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -8563,9 +8539,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 @@ -13591,8 +13567,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512-NEXT: # 
ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -13754,7 +13729,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -13779,7 +13754,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -13797,7 +13772,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -13825,7 +13800,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -13875,7 +13850,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13904,7 +13879,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13926,7 +13901,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = 
[0,14,2,3,4,5,6,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13988,7 +13963,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -14004,10 +13979,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -14287,8 +14262,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] 
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -14450,7 +14424,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -14475,7 +14449,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -14493,7 +14467,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 
64-byte Reload @@ -14521,7 +14495,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -14571,7 +14545,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14600,7 +14574,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14622,7 +14596,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14684,7 +14658,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -14700,10 +14674,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -14983,8 +14957,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -15146,7 +15119,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -15171,7 +15144,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -15189,7 +15162,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[14,u,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -15217,7 +15190,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -15267,7 +15240,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15296,7 +15269,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill @@ -15318,7 +15291,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15380,7 +15353,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -15396,10 +15369,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -15679,8 +15652,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -15842,7 +15814,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -15867,7 +15839,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -15885,7 +15857,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -15913,7 +15885,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -15963,7 +15935,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15992,7 +15964,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16014,7 +15986,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16076,7 +16048,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -16092,10 +16064,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -16375,8 +16347,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -16538,7 +16509,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -16563,7 +16534,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,1,13,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -16581,7 +16552,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -16609,7 +16580,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -16659,7 +16630,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16688,7 +16659,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16710,7 +16681,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16772,7 +16743,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -16788,10 +16759,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512BW-NEXT: vinserti32x4 
$2, 448(%r8), %zmm1, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -17071,8 +17042,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -17234,7 +17204,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -17259,7 +17229,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: 
vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -17277,7 +17247,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -17305,7 +17275,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -17355,7 +17325,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: 
vmovdqa64 (%r9), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17384,7 +17354,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17406,7 +17376,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17468,7 +17438,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -17484,10 +17454,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -17767,8 +17737,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -17930,7 +17899,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; 
AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -17955,7 +17924,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -17973,7 +17942,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -18001,7 +17970,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = 
[10,0,2,3,4,5,11,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -18051,7 +18020,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18080,7 +18049,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18102,7 +18071,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18164,7 +18133,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -18180,10 +18149,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -18463,8 +18432,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 @@ -18626,7 +18594,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -18651,7 +18619,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -18669,7 +18637,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -18697,7 +18665,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -18747,7 +18715,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18776,7 +18744,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18798,7 +18766,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18860,7 +18828,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 @@ -18876,10 +18844,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index edcc4b2584595..8ca0e0cb97186 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -172,9 +172,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -194,9 +194,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm2, (%rax) @@ -216,9 +216,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) @@ -238,9 +238,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -260,9 +260,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512BW-NEXT: 
vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -282,9 +282,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -304,9 +304,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -326,9 +326,9 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 
= [0,2,4,6,8,10,12,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -625,12 +625,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512-NEXT: movb $112, %cl ; AVX512-NEXT: kmovw %ecx, %k1 @@ -638,7 +638,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k1 @@ -646,7 +646,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-NEXT: movb $28, %cl ; AVX512-NEXT: kmovw %ecx, %k1 @@ -670,12 +670,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512-FCP-NEXT: movb $112, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -683,7 +683,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -691,7 +691,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512-FCP-NEXT: vpermi2q 
%zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: movb $28, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -715,12 +715,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-NEXT: movb $112, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -728,7 +728,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-NEXT: movb $56, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -736,7 +736,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-NEXT: movb $28, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -760,12 +760,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: movb $112, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -773,7 +773,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: movb $56, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -781,7 +781,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: movb $28, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -805,12 +805,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 
; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512BW-NEXT: movb $112, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -818,7 +818,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -826,7 +826,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: movb $28, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -850,12 +850,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; 
AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: movb $112, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -863,7 +863,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -871,7 +871,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: movb $28, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -895,12 +895,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; 
AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: movb $112, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -908,7 +908,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: movb $56, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -916,7 +916,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: movb $28, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -940,12 +940,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $112, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -953,7 +953,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $56, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -961,7 +961,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movb $28, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -1595,7 +1595,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movb $96, %sil ; AVX512-NEXT: kmovw %esi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512-NEXT: 
vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] @@ -1691,7 +1691,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $96, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1732,7 +1732,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,7,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,3,7,0] ; AVX512-FCP-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %sil @@ -1857,7 +1857,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] @@ -1945,7 +1945,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm9 = [1,3,7,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,3,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: movb $14, %sil @@ -1975,7 +1975,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -2084,7 +2084,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $96, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] @@ -2180,7 +2180,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $96, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -2221,7 +2221,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 
{%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,3,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %sil @@ -2346,7 +2346,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] @@ -2434,7 +2434,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,3,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: movb $14, %sil @@ -2464,7 +2464,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = 
[14,1,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -3832,15 +3832,15 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] ; AVX512-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] ; AVX512-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] ; AVX512-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4013,16 +4013,16 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $24, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,1,2,3,4,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,15,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -4088,7 +4088,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,7,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,3,7,7] ; AVX512-FCP-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 ; AVX512-FCP-NEXT: movb $96, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 @@ -4244,15 +4244,15 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4426,7 +4426,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,1,2,3,4,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,15,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -4519,7 +4519,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,3,7,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 ; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] @@ -4569,14 +4569,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) @@ -4656,15 +4656,15 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] ; AVX512BW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4837,16 +4837,16 @@ define 
void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $24, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,1,2,3,4,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,15,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -4912,7 +4912,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm0 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,3,7,7] ; AVX512BW-FCP-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 ; AVX512BW-FCP-NEXT: movb $96, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 @@ -5068,15 +5068,15 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $24, %sil ; 
AVX512DQ-BW-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5250,7 +5250,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $24, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5343,7 +5343,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,3,7,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] @@ -5393,14 +5393,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) @@ -8232,15 +8232,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa64 
{{.*#+}} zmm25 = [12,u,u,3,4,5,6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [12,0,0,3,4,5,6,13] ; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 ; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k3 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 ; AVX512-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -8352,7 +8352,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,11,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,11,0,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512-NEXT: vmovdqa 192(%r8), %ymm7 @@ -8382,17 +8382,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 -; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm10, 1472(%rax) @@ -8482,7 +8482,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: kmovw %r10d, %k2 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] ; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 ; AVX512-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] @@ -8672,19 +8672,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,11,u,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,11,0,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,11,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,u,u,3,4,5,6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [12,0,0,3,4,5,6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512-FCP-NEXT: movb $24, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,1,2,3,4,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,15,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -8820,18 +8820,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 192(%r8), %ymm2 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,12,u,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,3,4,5,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) @@ -9117,15 +9117,15 @@ define void 
@store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [12,u,u,3,4,5,6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [12,0,0,3,4,5,6,13] ; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 ; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,15,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} @@ -9232,7 +9232,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,11,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm6 @@ -9263,17 +9263,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,12,u,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,12,0,3,4,5,6,7] 
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1472(%rax) @@ -9365,7 +9365,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] ; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] @@ -9553,18 +9553,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,11,u,u,4,5,6,7] +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,11,0,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,11,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,11,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,15,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] @@ -9696,17 +9696,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,12,u,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,12,0,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,u,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [13,0,2,3,4,5,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1472(%rax) @@ -9990,15 +9990,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [12,u,u,3,4,5,6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [12,0,0,3,4,5,6,13] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 ; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -10110,7 +10110,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,11,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq 
{{.*#+}} zmm0 = [0,1,11,0,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm7 @@ -10140,17 +10140,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1472(%rax) @@ -10240,7 +10240,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = 
[1,3,7,7] ; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 ; AVX512BW-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] @@ -10430,19 +10430,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,11,u,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,11,0,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,11,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,u,u,3,4,5,6,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [12,0,0,3,4,5,6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $24, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,1,2,3,4,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,15,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -10578,18 +10578,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa 192(%r8), %ymm2 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [0,12,u,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,3,4,5,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) @@ -10875,15 +10875,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [12,u,u,3,4,5,6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [12,0,0,3,4,5,6,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 ; AVX512DQ-BW-NEXT: movb 
$24, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} @@ -10990,7 +10990,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,11,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm6 @@ -11021,17 +11021,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,12,u,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,12,0,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1472(%rax) @@ -11123,7 +11123,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] @@ -11311,18 +11311,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,11,u,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,11,0,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,11,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,11,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $24, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] @@ -11454,17 +11454,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,12,u,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,12,0,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,u,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [13,0,2,3,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1472(%rax) @@ -17203,10 +17203,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] ; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -17440,20 +17440,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,u,u,3,4,5,6,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 ; AVX512-NEXT: movb $6, %sil ; AVX512-NEXT: kmovw %esi, %k4 ; AVX512-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: 
vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,u,u,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,0,0,6,7] ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 ; AVX512-NEXT: movb $64, %sil ; AVX512-NEXT: kmovw %esi, %k5 @@ -17462,16 +17462,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %esi, %k5 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} ; AVX512-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [13,u,2,3,4,5,6,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [13,0,2,3,4,5,6,14] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512-NEXT: movb $12, %sil ; AVX512-NEXT: kmovw %esi, %k5 @@ -17480,9 +17480,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] ; AVX512-NEXT: 
vpermi2q %zmm0, %zmm8, %zmm2 ; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -17504,25 +17504,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $8, %sil @@ -18136,7 +18136,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -18187,10 +18187,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 @@ -18425,21 +18425,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,u,u,3,4,5,6,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [12,0,0,3,4,5,6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 ; AVX512-FCP-NEXT: movb $6, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k5 ; AVX512-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movb $64, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 @@ -18448,15 +18448,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: kmovw %esi, %k4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} ; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,11,u,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,11,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,12,u,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,u,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 ; AVX512-FCP-NEXT: movb $12, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 @@ -18465,9 +18465,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,9,u,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,9,0,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -18508,23 +18508,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $8, %sil @@ -19121,10 +19121,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 @@ -19358,20 +19358,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 
= [0,11,0,0,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 ; AVX512DQ-NEXT: movb $6, %sil ; AVX512DQ-NEXT: kmovw %esi, %k4 ; AVX512DQ-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,u,u,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,0,0,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 ; AVX512DQ-NEXT: movb $64, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 @@ -19380,16 +19380,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: kmovw %esi, %k5 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} ; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,1,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-NEXT: movb $12, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 @@ -19398,9 +19398,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -19423,25 +19423,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512DQ-NEXT: 
vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $8, %sil @@ -20044,7 +20044,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -20094,10 +20094,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 @@ -20336,20 +20336,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [12,u,u,3,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: movb $6, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k5 ; AVX512DQ-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movb $64, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 @@ -20358,15 +20358,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 ; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm3, %zmm10 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,11,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,11,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: movb $12, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 @@ -20375,9 +20375,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,9,u,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -20421,23 +20421,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $8, %sil @@ -21032,10 +21032,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -21269,20 +21269,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,u,u,3,4,5,6,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 ; AVX512BW-NEXT: movb $6, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 ; AVX512BW-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,u,u,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,0,0,6,7] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: movb $64, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 @@ -21291,16 +21291,16 @@ define void 
@store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %esi, %k5 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} ; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [13,u,2,3,4,5,6,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [13,0,2,3,4,5,6,14] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: movb $12, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 @@ -21309,9 +21309,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -21333,25 +21333,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $8, %sil @@ -21965,7 +21965,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -22016,10 +22016,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 @@ -22254,21 +22254,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,u,u,3,4,5,6,13] +; 
AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [12,0,0,3,4,5,6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 ; AVX512BW-FCP-NEXT: movb $6, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k5 ; AVX512BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movb $64, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 @@ -22277,15 +22277,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,11,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,11,0,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,12,u,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,u,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: movb $12, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 @@ -22294,9 +22294,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,9,u,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,9,0,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -22337,23 +22337,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $8, %sil @@ -22950,10 +22950,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 @@ -23187,20 +23187,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,11,u,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: movb $6, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,u,u,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,0,0,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: movb $64, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 @@ -23209,16 +23209,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] 
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: movb $12, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 @@ -23227,9 +23227,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -23252,25 +23252,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] ; AVX512DQ-BW-NEXT: 
vpermi2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $8, %sil @@ -23873,7 +23873,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -23923,10 +23923,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 @@ -24165,20 +24165,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [12,u,u,3,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movb $6, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k5 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $64, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 @@ -24187,15 +24187,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,11,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,11,0,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movb $12, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 @@ -24204,9 +24204,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = 
[0,1,2,3,4,8,0,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,9,u,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -24250,23 +24250,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $8, %sil diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index 39c5e6a2b617d..e837f14d367b2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -174,9 +174,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -198,9 +198,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -222,9 +222,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) @@ -246,9 +246,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -270,9 +270,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; 
AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -294,9 +294,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -318,9 +318,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -342,9 +342,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll index 9ce0f1c2897bb..53a6d306ef84d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -606,7 +606,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -617,7 +617,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -628,7 +628,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -639,7 +639,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 738c959b5fe10..2b539aecc2ad8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -683,7 +683,7 @@ define void @store_i8_stride5_vf8(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] ; AVX2-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -746,7 +746,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] ; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,0,0,u,1,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] @@ -808,7 +808,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,0,0,u,1,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = 
[0,0,0,0,0,0,1,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] @@ -873,7 +873,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] ; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -941,7 +941,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] ; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -1289,18 +1289,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[0,8],zero,zero,zero,ymm8[1,9],zero,zero,zero,ymm8[2,10],zero,zero,zero,ymm8[3,19],zero,zero,zero,ymm8[28,20],zero,zero,zero,ymm8[29,21],zero,zero,zero,ymm8[30,22] ; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[3,7],zero,zero,zero,ymm5[8,12],zero,zero,zero,ymm5[9,13],zero,zero,zero,ymm5[18,22],zero,zero,zero,ymm5[19,23],zero,zero,zero,ymm5[24,28],zero,zero -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6],zero,zero,zero,ymm6[3,7],zero,zero,zero,ymm6[8,12],zero,zero,zero,ymm6[9,17],zero,zero,zero,ymm6[22,18],zero,zero,zero,ymm6[23,19],zero,zero,zero,ymm6[24,28] ; AVX2-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 @@ -1342,7 +1342,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] ; AVX512-NEXT: vpermd %zmm1, %zmm6, %zmm6 ; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] @@ -1369,19 +1369,19 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] @@ -1422,7 +1422,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512DQ-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm6, %zmm6 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] @@ -1449,19 +1449,19 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] @@ -1503,7 +1503,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} 
ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero ; AVX512BW-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm4, %zmm6, %zmm6 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -1528,19 +1528,19 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] ; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = 
zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero ; AVX512BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -1582,7 +1582,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm4, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -1607,19 +1607,19 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 ; 
AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero ; AVX512DQ-BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 @@ -2223,7 +2223,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb 
%ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] ; AVX2-NEXT: vpermd %ymm3, %ymm9, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] @@ -2233,7 +2233,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -2315,7 +2315,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm9, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] @@ -2325,7 +2325,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -2356,7 +2356,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 @@ -2370,7 +2370,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, 
%ymm6, %ymm7, %ymm6 @@ -2386,7 +2386,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 @@ -2402,11 +2402,11 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] @@ -2416,7 +2416,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 ; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -2460,7 +2460,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpternlogq $226, %ymm8, %ymm7, %ymm5 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512-NEXT: vmovdqa (%r8), %xmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] ; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm6 ; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] @@ -2542,7 +2542,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpternlogq $226, %ymm7, %ymm6, %ymm4 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] @@ -2566,7 +2566,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,u,5,5,5,5,u,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,0,5,5,5,5,0,6] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX512-FCP-NEXT: vpandn %ymm8, %ymm9, %ymm8 @@ -2584,7 +2584,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm2, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,u,7,7,7,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) @@ -2625,7 +2625,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpternlogq $226, %ymm8, %ymm7, %ymm5 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm6 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 
= ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] @@ -2707,7 +2707,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm7, %ymm6, %ymm4 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] @@ -2731,7 +2731,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,u,5,5,5,5,u,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,0,5,5,5,5,0,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX512DQ-FCP-NEXT: vpandn %ymm8, %ymm9, %ymm8 @@ -2749,7 +2749,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm2, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,u,7,7,7,7] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) @@ -2787,7 +2787,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -2806,7 +2806,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,3,3,u,4,4,4,4] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,3,3,0,4,4,4,4] ; AVX512BW-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512BW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -2815,7 +2815,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [3,3,3,3,u,4,4,4] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,3,3,3,0,4,4,4] ; AVX512BW-NEXT: vpermd %ymm0, %ymm6, %ymm6 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] @@ -2879,7 +2879,7 @@ define void 
@store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -2898,7 +2898,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,3,3,u,4,4,4,4] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,3,3,0,4,4,4,4] ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 ; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -2907,7 +2907,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] ; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -2925,7 +2925,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movl 
$415641996, %eax # imm = 0x18C6318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] ; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: movl $-2078209982, %eax # imm = 0x84210842 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -2965,7 +2965,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -2984,7 +2984,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,3,3,u,4,4,4,4] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,3,3,0,4,4,4,4] ; AVX512DQ-BW-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512DQ-BW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -2993,7 +2993,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [3,3,3,3,u,4,4,4] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = 
[3,3,3,3,0,4,4,4] ; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm6, %ymm6 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] @@ -3057,7 +3057,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 @@ -3076,7 +3076,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,3,3,u,4,4,4,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,3,3,0,4,4,4,4] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 ; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -3085,7 +3085,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] ; 
AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 @@ -3103,7 +3103,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: movl $-2078209982, %eax # imm = 0x84210842 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -4252,7 +4252,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,u,4,4,4,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,0,4,4,4,4] ; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm0 @@ -4273,7 +4273,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm4 ; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] ; AVX2-NEXT: vpermd %ymm12, %ymm2, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 @@ -4514,7 +4514,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.vec ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,1,1,4,6,5,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [3,3,3,u,4,4,4,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,3,3,0,4,4,4,4] ; AVX2-FP-NEXT: vpermd %ymm4, %ymm6, %ymm4 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 @@ -4535,7 +4535,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm4 ; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,3,u,4,4,4] +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,3,0,4,4,4] ; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 @@ -4635,7 +4635,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm3 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 @@ -4684,7 +4684,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] ; AVX2-FCP-NEXT: vpor %ymm9, %ymm15, %ymm9 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm8 = [6,6,6,6,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm4, %ymm9, %ymm4 @@ -4721,13 +4721,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX2-FCP-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,u,4,4,4,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,0,4,4,4,4] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 @@ -4748,7 +4748,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,3,u,4,4,4] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,3,0,4,4,4] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 @@ -4779,7 +4779,7 
@@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 @@ -4905,10 +4905,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7] ; AVX512-NEXT: vpermd %zmm15, %zmm1, %zmm31 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17] ; AVX512-NEXT: vpermi2d %zmm15, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm15 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] @@ -4968,7 +4968,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm5[0,0,1,1,4,4,5,5] ; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm9[0,0,1,1,4,4,5,5] ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] ; AVX512-NEXT: vpermd %zmm16, %zmm0, %zmm0 ; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r9) @@ -5052,7 +5052,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm11 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm17 ; AVX512-FCP-NEXT: vporq %xmm8, %xmm11, %xmm29 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm11 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] @@ -5060,7 +5060,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX512-FCP-NEXT: vpermd %ymm11, %ymm31, %ymm27 @@ -5144,13 +5144,13 @@ define void @store_i8_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm28[0,0,1,1,4,4,5,5] ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm3[0,0,1,1,4,4,5,5] ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] ; AVX512-FCP-NEXT: vpermd %zmm11, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%r9) @@ -5263,10 +5263,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm15, %zmm1, %zmm31 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17] ; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm15 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] @@ -5326,7 +5326,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm5[0,0,1,1,4,4,5,5] ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm9[0,0,1,1,4,4,5,5] ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] ; AVX512DQ-NEXT: vpermd %zmm16, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r9) @@ -5410,7 +5410,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm17 ; AVX512DQ-FCP-NEXT: vporq %xmm8, %xmm11, %xmm29 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] @@ -5418,7 +5418,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm31, %ymm27 @@ -5502,13 +5502,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm28[0,0,1,1,4,4,5,5] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm3[0,0,1,1,4,4,5,5] ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%r9) @@ -5560,7 +5560,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovq %rax, %k4 ; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k4} ; AVX512BW-NEXT: vmovdqa64 32(%r8), %ymm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] ; AVX512BW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 ; AVX512BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-NEXT: kmovq %rax, %k2 @@ -5586,7 +5586,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] ; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [3,3,3,u,4,4,4,4] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [3,3,3,0,4,4,4,4] ; AVX512BW-NEXT: vpermd %ymm25, %ymm12, %ymm17 ; AVX512BW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] @@ -5597,7 +5597,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-NEXT: kmovq %rax, %k2 ; AVX512BW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [3,3,3,3,u,4,4,4] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,3,3,3,0,4,4,4] ; AVX512BW-NEXT: vpermd %ymm16, %ymm13, %ymm17 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] @@ -5634,7 +5634,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] ; AVX512BW-NEXT: vpermd %zmm16, %zmm15, %zmm15 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm16 ; AVX512BW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 @@ -5660,7 +5660,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm3, %zmm7, %zmm3 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -5731,7 +5731,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,0,1,1] ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,u,4,4,4,4] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,0,4,4,4,4] ; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm3, %ymm22 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] @@ -5743,7 +5743,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm24 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,2,2,2,2,2,2,27,27,27,27,u,28,28,28] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [1,1,2,2,2,2,2,2,27,27,27,27,0,28,28,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm24, %zmm15 ; AVX512BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 @@ -5769,7 +5769,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm13 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm23 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm24, %zmm23 ; AVX512BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-FCP-NEXT: kmovq %rax, %k4 @@ -5799,7 +5799,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,0,1,1] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm25, %zmm17 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 @@ -5822,7 +5822,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm6 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm7 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 @@ -5845,7 +5845,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm18, %ymm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] ; 
AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -5899,7 +5899,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: kmovq %rax, %k4 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k4} ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %ymm16 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQ-BW-NEXT: kmovq %rax, %k2 @@ -5925,7 +5925,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [3,3,3,u,4,4,4,4] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [3,3,3,0,4,4,4,4] ; AVX512DQ-BW-NEXT: vpermd %ymm25, %ymm12, %ymm17 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] @@ -5936,7 +5936,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-NEXT: kmovq %rax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [3,3,3,3,u,4,4,4] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,3,3,3,0,4,4,4] ; AVX512DQ-BW-NEXT: vpermd %ymm16, %ymm13, %ymm17 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] @@ -5973,7 +5973,7 @@ define void @store_i8_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] ; AVX512DQ-BW-NEXT: vpermd %zmm16, %zmm15, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm16 ; AVX512DQ-BW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 @@ -5999,7 +5999,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm3, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -6070,7 +6070,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,0,1,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,u,4,4,4,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,0,4,4,4,4] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm3, %ymm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] @@ -6082,7 +6082,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 32(%r8), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,2,2,2,2,2,2,27,27,27,27,u,28,28,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [1,1,2,2,2,2,2,2,27,27,27,27,0,28,28,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm24, %zmm15 ; AVX512DQ-BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 @@ -6108,7 +6108,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm13 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm24, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 @@ -6138,7 +6138,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,0,1,1] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm25, %zmm17 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 @@ -6161,7 +6161,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm6 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 @@ -6184,7 +6184,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm18, %ymm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 0ea5cf07f055a..88144e7880e35 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -710,7 +710,7 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] @@ -741,7 +741,7 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] @@ -772,7 +772,7 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] @@ -1269,7 +1269,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero,ymm4[18,26],zero,zero @@ -1278,7 +1278,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero @@ -1287,7 +1287,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = 
ymm2[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-NEXT: vmovdqa %ymm4, 32(%rax) @@ -1311,7 +1311,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero,ymm4[18,26],zero,zero @@ -1320,7 +1320,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; 
AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero @@ -1329,7 +1329,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rax) @@ -1353,7 +1353,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero,ymm4[18,26],zero,zero @@ -1362,7 +1362,7 @@ define void @store_i8_stride6_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero @@ -1371,7 +1371,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rax) @@ -1589,7 +1589,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11] ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u] @@ -1598,7 +1598,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] ; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 @@ -1675,7 +1675,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u] @@ -1684,7 +1684,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 @@ -2241,12 +2241,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm9 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm9 ; AVX2-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm12 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 @@ -2281,7 +2281,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] ; AVX2-NEXT: vmovdqa %ymm3, %ymm12 @@ -2297,7 +2297,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 ; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] @@ -2315,7 +2315,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; 
AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] @@ -2326,7 +2326,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2377,12 +2377,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm11 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm9 @@ -2419,7 +2419,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -2431,7 +2431,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa %xmm7, %xmm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] @@ -2451,7 +2451,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] @@ -2462,7 +2462,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2513,12 +2513,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm9 @@ -2555,7 +2555,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -2567,7 +2567,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; 
AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] @@ -2587,7 +2587,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] @@ -2598,7 +2598,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2792,7 +2792,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] @@ -2981,7 +2981,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] 
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] @@ -3029,7 +3029,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm8 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512BW-NEXT: vpermw %ymm8, %ymm13, %ymm8 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-NEXT: vprold $16, %xmm13, %xmm13 @@ -3046,14 +3046,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] ; AVX512BW-NEXT: vpermw %ymm16, %ymm17, %ymm16 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm7 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] ; AVX512BW-NEXT: vpermw %ymm14, %ymm16, %ymm14 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX512BW-NEXT: vprold $16, %ymm16, %ymm16 @@ -3062,23 +3062,23 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = 
[4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] ; AVX512BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512BW-NEXT: vpermw %ymm11, %ymm10, %ymm9 {%k1} ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] ; AVX512BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vpermw %zmm10, %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15],ymm3[24],ymm5[24],ymm3[25],ymm5[25],ymm3[26],ymm5[26],ymm3[27],ymm5[27],ymm3[28],ymm5[28],ymm3[29],ymm5[29],ymm3[30],ymm5[30],ymm3[31],ymm5[31] -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm11, %ymm12, %ymm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm10, %ymm12, %ymm11 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm10 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -3097,7 +3097,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 @@ -3124,9 +3124,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; 
AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] ; AVX512BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm7, %ymm8 {%k2} @@ -3153,7 +3153,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] ; AVX512BW-FCP-NEXT: vpermw %ymm16, %ymm17, %ymm16 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 ; AVX512BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 @@ -3161,30 +3161,30 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm6 {%k3} ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm14 = 
ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] ; AVX512BW-FCP-NEXT: vpermw %ymm16, %ymm17, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [9,8,11,10,9,8,11,10,9,8,11,10,13,12,15,14] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [9,8,11,10,9,8,11,10,9,8,11,10,13,12,15,14] ; AVX512BW-FCP-NEXT: vpermw %ymm14, %ymm17, %ymm16 {%k2} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] ; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512BW-FCP-NEXT: vpermw %ymm9, %ymm11, %ymm10 {%k1} ; AVX512BW-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] ; AVX512BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm11, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15],ymm3[24],ymm5[24],ymm3[25],ymm5[25],ymm3[26],ymm5[26],ymm3[27],ymm5[27],ymm3[28],ymm5[28],ymm3[29],ymm5[29],ymm3[30],ymm5[30],ymm3[31],ymm5[31] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm11, %ymm12, %ymm11 -; AVX512BW-FCP-NEXT: 
vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm12, %ymm11 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -3203,7 +3203,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 @@ -3243,7 +3243,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm8 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; 
AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512DQ-BW-NEXT: vpermw %ymm8, %ymm13, %ymm8 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512DQ-BW-NEXT: vprold $16, %xmm13, %xmm13 @@ -3260,14 +3260,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] ; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm17, %ymm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] ; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm16, %ymm14 ; AVX512DQ-BW-NEXT: vpunpcklbw 
{{.*#+}} ymm16 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX512DQ-BW-NEXT: vprold $16, %ymm16, %ymm16 @@ -3276,23 +3276,23 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] ; AVX512DQ-BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512DQ-BW-NEXT: vpermw %ymm11, %ymm10, %ymm9 {%k1} ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] ; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vpermw %zmm10, %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15],ymm3[24],ymm5[24],ymm3[25],ymm5[25],ymm3[26],ymm5[26],ymm3[27],ymm5[27],ymm3[28],ymm5[28],ymm3[29],ymm5[29],ymm3[30],ymm5[30],ymm3[31],ymm5[31] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm11, %ymm12, %ymm11 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm12, %ymm11 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -3311,7 +3311,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 @@ -3338,9 +3338,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] ; AVX512DQ-BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm7, %ymm8 {%k2} @@ -3367,7 +3367,7 @@ define void 
@store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm16, %ymm17, %ymm16 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 @@ -3375,30 +3375,30 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm6 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm16, %ymm17, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = 
[9,8,11,10,9,8,11,10,9,8,11,10,13,12,15,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [9,8,11,10,9,8,11,10,9,8,11,10,13,12,15,14] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm14, %ymm17, %ymm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm9, %ymm11, %ymm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = 
[2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] ; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15],ymm3[24],ymm5[24],ymm3[25],ymm5[25],ymm3[26],ymm5[26],ymm3[27],ymm5[27],ymm3[28],ymm5[28],ymm3[29],ymm5[29],ymm3[30],ymm5[30],ymm3[31],ymm5[31] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm11, %ymm12, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm10, %ymm12, %ymm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -3417,7 +3417,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 @@ -4463,7 +4463,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm5 @@ -4509,11 +4509,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, 
%ymm5, %ymm0 ; AVX2-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm5 ; AVX2-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4569,7 +4569,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] @@ -4608,11 +4608,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [2,1,0,3,4,4,4,4] ; AVX2-NEXT: vmovdqa %xmm7, %xmm14 ; AVX2-NEXT: vpshufb %xmm2, 
%xmm7, %xmm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 @@ -4663,7 +4663,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -4699,10 +4699,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [10,13,12,11,14,13,14,15] ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 ; 
AVX2-NEXT: vpshufb %xmm1, %xmm14, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] @@ -4788,7 +4788,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm14 @@ -4838,10 +4838,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 @@ -4896,7 +4896,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FP-NEXT: vpshufb %xmm3, 
%xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -4932,11 +4932,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [10,13,12,11,14,13,14,15] ; AVX2-FP-NEXT: vmovdqa %xmm7, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 @@ -4988,7 +4988,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: 
vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload @@ -5025,10 +5025,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [2,1,0,3,4,4,4,4] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] @@ -5113,7 +5113,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm14 @@ 
-5163,10 +5163,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 @@ -5221,7 +5221,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -5257,11 +5257,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [10,13,12,11,14,13,14,15] ; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm15 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 @@ -5313,7 +5313,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload @@ -5350,10 +5350,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [2,1,0,3,4,4,4,4] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] @@ -5831,7 +5831,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3 @@ -6395,7 +6395,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3 @@ -6550,14 +6550,14 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, 
%zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] ; AVX512BW-NEXT: vpermw %zmm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512BW-NEXT: vpermw %ymm5, %ymm20, %ymm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm18 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm19 @@ -6571,7 +6571,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %xmm5 ; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm13 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512BW-NEXT: vpermw %ymm6, %ymm23, %ymm6 ; AVX512BW-NEXT: vmovdqa64 (%r8), %ymm21 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] @@ -6630,7 +6630,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm20 = 
ymm23[0],ymm20[0],ymm23[1],ymm20[1],ymm23[2],ymm20[2],ymm23[3],ymm20[3],ymm23[4],ymm20[4],ymm23[5],ymm20[5],ymm23[6],ymm20[6],ymm23[7],ymm20[7],ymm23[16],ymm20[16],ymm23[17],ymm20[17],ymm23[18],ymm20[18],ymm23[19],ymm20[19],ymm23[20],ymm20[20],ymm23[21],ymm20[21],ymm23[22],ymm20[22],ymm23[23],ymm20[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm23 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm25 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm23, %ymm25, %ymm23 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm27 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] @@ -6639,7 +6639,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm28 = 
ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm28, %ymm29, %ymm28 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm20 {%k1} @@ -6684,7 +6684,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512BW-NEXT: vpermw %ymm9, %ymm11, %ymm9 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 ; AVX512BW-NEXT: vpshufb %xmm23, %xmm8, %xmm14 @@ -6699,7 +6699,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] 
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] ; AVX512BW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -6756,7 +6756,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm9, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm6 @@ -6767,7 +6767,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} 
ymm2 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm12, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 @@ -6825,7 +6825,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] ; AVX512BW-FCP-NEXT: vpermw %ymm12, %ymm24, %ymm12 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm18 @@ -6836,16 +6836,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm16[8],xmm10[8],xmm16[9],xmm10[9],xmm16[10],xmm10[10],xmm16[11],xmm10[11],xmm16[12],xmm10[12],xmm16[13],xmm10[13],xmm16[14],xmm10[14],xmm16[15],xmm10[15] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = 
xmm21[0],xmm18[0],xmm21[1],xmm18[1],xmm21[2],xmm18[2],xmm21[3],xmm18[3],xmm21[4],xmm18[4],xmm21[5],xmm18[5],xmm21[6],xmm18[6],xmm21[7],xmm18[7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512BW-FCP-NEXT: vpermw %ymm16, %ymm26, %ymm16 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm12, %xmm16 ; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero,xmm12[4],zero,xmm12[5],zero,xmm12[6],zero,xmm12[7],zero ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] ; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] @@ -6890,9 +6890,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[16],ymm11[16],ymm14[17],ymm11[17],ymm14[18],ymm11[18],ymm14[19],ymm11[19],ymm14[20],ymm11[20],ymm14[21],ymm11[21],ymm14[22],ymm11[22],ymm14[23],ymm11[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = 
xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] ; AVX512BW-FCP-NEXT: vpermw %zmm11, %zmm14, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] ; AVX512BW-FCP-NEXT: vpermw %zmm19, %zmm20, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] @@ -6903,7 +6903,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm3 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm20, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512BW-FCP-NEXT: vpermw %ymm4, %ymm6, %ymm4 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = 
[2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] @@ -6959,14 +6959,14 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] ; AVX512DQ-BW-NEXT: vpermw %ymm5, %ymm20, %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm18 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm19 @@ -6980,7 +6980,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm13 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm6, %ymm23, %ymm6 ; AVX512DQ-BW-NEXT: vmovdqa64 
(%r8), %ymm21 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] @@ -7039,7 +7039,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm23[0],ymm20[0],ymm23[1],ymm20[1],ymm23[2],ymm20[2],ymm23[3],ymm20[3],ymm23[4],ymm20[4],ymm23[5],ymm20[5],ymm23[6],ymm20[6],ymm23[7],ymm20[7],ymm23[16],ymm20[16],ymm23[17],ymm20[17],ymm23[18],ymm20[18],ymm23[19],ymm20[19],ymm23[20],ymm20[20],ymm23[21],ymm20[21],ymm23[22],ymm20[22],ymm23[23],ymm20[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm23 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm25 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm23, %ymm25, %ymm23 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm27 ; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] @@ -7048,7 +7048,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm28 = 
ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm28, %ymm29, %ymm28 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm20 {%k1} @@ -7093,7 +7093,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512DQ-BW-NEXT: vpermw %ymm9, %ymm11, %ymm9 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 ; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm8, %xmm14 @@ -7108,7 +7108,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] ; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] ; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -7165,7 +7165,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm9, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm6 @@ -7176,7 +7176,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm12, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 @@ -7234,7 +7234,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm12, %ymm24, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm18 @@ -7245,16 +7245,16 @@ define void 
@store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm16[8],xmm10[8],xmm16[9],xmm10[9],xmm16[10],xmm10[10],xmm16[11],xmm10[11],xmm16[12],xmm10[12],xmm16[13],xmm10[13],xmm16[14],xmm10[14],xmm16[15],xmm10[15] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm21[0],xmm18[0],xmm21[1],xmm18[1],xmm21[2],xmm18[2],xmm21[3],xmm18[3],xmm21[4],xmm18[4],xmm21[5],xmm18[5],xmm21[6],xmm18[6],xmm21[7],xmm18[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm16, %ymm26, %ymm16 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm12, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero,xmm12[4],zero,xmm12[5],zero,xmm12[6],zero,xmm12[7],zero ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] @@ -7299,9 +7299,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = 
ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[16],ymm11[16],ymm14[17],ymm11[17],ymm14[18],ymm11[18],ymm14[19],ymm11[19],ymm14[20],ymm11[20],ymm14[21],ymm11[21],ymm14[22],ymm11[22],ymm14[23],ymm11[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm11, %zmm14, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm19, %zmm20, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] @@ -7312,7 +7312,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm20, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm4, %ymm6, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 2eb9f5e8c6e27..0495e240ba968 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1005,7 +1005,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 @@ -1082,13 +1082,13 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512-FCP-NEXT: vpermi2q 
%ymm3, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1165,13 +1165,13 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: 
vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1244,13 +1244,13 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] ; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 @@ -1324,13 +1324,13 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 @@ -1988,7 +1988,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] @@ -2180,7 +2180,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,1,u,1,u,0,0,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] @@ -2329,7 +2329,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 ; 
AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,1,u,1,u,0,0,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] @@ -2394,7 +2394,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] @@ -2476,7 +2476,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] ; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] @@ -2552,7 +2552,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] @@ -2634,7 +2634,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] @@ -3593,7 +3593,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX-NEXT: vorps %ymm2, %ymm1, %ymm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm2 = [16777216,197120] ; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] @@ -4079,7 +4079,7 @@ define void @store_i8_stride7_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] @@ -4101,7 +4101,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -4198,7 +4198,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r10), %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = 
[0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] ; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero @@ -4371,7 +4371,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm10 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,1,u,1,u,0,0,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] @@ -4452,8 +4452,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vporq %zmm0, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,5,4,0,5,0,4,0] ; AVX512-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -4500,7 +4499,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r10), %ymm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero @@ -4673,7 +4672,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm10 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,1,u,1,u,0,0,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] @@ -4754,8 +4753,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,5,4,0,5,0,4,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -4863,7 +4861,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-NEXT: kmovq %rcx, %k2 ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] ; AVX512BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] @@ -4900,7 +4898,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k2 @@ -5012,7 +5010,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-FCP-NEXT: 
kmovq %rcx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] @@ -5049,7 +5047,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 @@ -5165,7 +5163,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = 
[10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] @@ -5202,7 +5200,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 @@ -5314,7 +5312,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = 
zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] @@ -5351,7 +5349,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 @@ -7192,7 +7190,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm10 = [218890240,986624] ; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] @@ -7224,7 +7222,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm15 = [16777216,197120] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8365,7 +8363,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13 @@ -8604,7 +8602,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm10, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 @@ -8781,7 +8779,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%rax), %ymm1 ; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm16 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm3 @@ -8805,7 +8803,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] ; AVX512-NEXT: vmovdqa 32(%rax), %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,6] @@ -9371,7 +9369,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm16 @@ -9476,7 +9474,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: # zmm18 = zmm1[0,1,0,1],mem[0,1,0,1] ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,2,0,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,0,1,2,0,0,1] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm19 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[1,1,0,0,4,5,6,7] @@ -9500,7 +9498,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm14 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,5,7,4,5] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,4,5,5,7,4,5] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm20 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] @@ -9651,7 +9649,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%rax), %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm16 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm3 @@ -9675,7 +9673,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] ; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,6] @@ -10241,7 +10239,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm16 @@ -10346,7 +10344,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: # zmm18 = zmm1[0,1,0,1],mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,2,0,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,0,1,2,0,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[1,1,0,0,4,5,6,7] @@ -10370,7 +10368,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,5,7,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,4,5,5,7,4,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm20 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] @@ -10536,7 +10534,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm11 ; AVX512BW-NEXT: vmovdqa64 32(%rax), %ymm17 ; AVX512BW-NEXT: vpshufb %ymm27, %ymm17, %ymm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-NEXT: vpermw %ymm17, %ymm26, %ymm27 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm27, %zmm22, %zmm22 ; AVX512BW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 @@ -10589,7 +10587,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovq %r10, %k3 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] ; AVX512BW-NEXT: vpermi2w %zmm22, %zmm17, %zmm0 ; AVX512BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-NEXT: kmovq %rax, %k5 @@ -10661,7 +10659,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = 
[14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] ; AVX512BW-NEXT: vpermw %zmm22, %zmm11, %zmm11 ; AVX512BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -10690,7 +10688,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm21, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] ; AVX512BW-NEXT: vpermi2w %zmm22, %zmm17, %zmm25 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm23, %xmm22 @@ -10728,7 +10726,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-NEXT: vpermw %zmm13, %zmm1, %zmm1 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 @@ -10860,7 +10858,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vporq %ymm1, %ymm22, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm25, 
%zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm31 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] ; AVX512BW-FCP-NEXT: vpermw %ymm31, %ymm22, %ymm22 ; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm31, %ymm25 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm25, %zmm22 @@ -10904,7 +10902,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %r10, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm10, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 @@ -10946,7 +10944,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] ; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 @@ -10981,7 +10979,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] ; AVX512BW-FCP-NEXT: vpermi2w %zmm10, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 @@ -11017,7 +11015,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm10, %zmm2 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 @@ -11047,7 +11045,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -11169,7 +11167,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %ymm17 ; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm17, %ymm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm26, %ymm27 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm27, %zmm22, %zmm22 ; AVX512DQ-BW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 @@ -11222,7 +11220,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: kmovq %r10, %k3 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm15 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm22, %zmm17, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512DQ-BW-NEXT: kmovq %rax, %k5 @@ -11294,7 +11292,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] ; AVX512DQ-BW-NEXT: vpermw %zmm22, %zmm11, %zmm11 ; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -11323,7 +11321,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm21, %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] ; AVX512DQ-BW-NEXT: vpermi2w %zmm22, %zmm17, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm23, %xmm22 @@ -11361,7 +11359,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-NEXT: vpermw %zmm13, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rax # imm = 
0x4081020408102040 @@ -11493,7 +11491,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vporq %ymm1, %ymm22, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm31, %ymm22, %ymm22 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm31, %ymm25 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm25, %zmm22 @@ -11537,7 +11535,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm10, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 @@ -11579,7 +11577,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, 
%zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 @@ -11614,7 +11612,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm10, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 @@ -11650,7 +11648,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm10, %zmm2 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 @@ -11680,7 +11678,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; 
AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 523132bc1436e..dd3f96322bcc1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -437,7 +437,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -460,7 +460,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; 
AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FP-NEXT: vzeroupper @@ -483,7 +483,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper @@ -506,7 +506,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -529,7 +529,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = 
[0,4,1,5,2,6,3,7] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -552,7 +552,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -575,7 +575,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -598,7 +598,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = 
[0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -621,7 +621,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -644,7 +644,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -667,7 +667,7 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -957,7 +957,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] @@ -1037,7 +1037,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] @@ -1117,7 +1117,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm5 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] @@ -1192,10 +1192,10 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA @@ -1265,10 +1265,10 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA @@ -1626,25 +1626,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] ; AVX2-NEXT: vpshufb %ymm13, 
%ymm12, %ymm14 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] ; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] ; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1690,25 +1690,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1754,25 +1754,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FCP-NEXT: 
vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1818,25 +1818,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512-NEXT: vpshufb %ymm13, 
%ymm12, %ymm14 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] @@ -1882,25 +1882,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, 
%ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} 
ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] @@ -1946,25 +1946,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX512DQ-NEXT: vpmovsxwd 
{{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] @@ -2010,25 +2010,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] @@ -2074,7 +2074,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = 
zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512BW-NEXT: vpmovsxwd {{.*#+}} zmm5 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] ; AVX512BW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] ; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6] @@ -2085,11 +2085,11 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxwq {{.*#+}} zmm8 = [2048,2305,2562,2819,3076,3333,3590,3847] ; AVX512BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] ; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512BW-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10 ; AVX512BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 ; AVX512BW-NEXT: movl $2228258, %ecx # imm = 0x220022 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -2128,23 +2128,23 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512BW-FCP-NEXT: vpshufb %zmm7, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512BW-FCP-NEXT: vpmovsxwd {{.*#+}} zmm9 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] ; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm8, %zmm8 ; AVX512BW-FCP-NEXT: movl $8913032, %ecx # imm = 0x880088 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} zmm10 = [2048,2305,2562,2819,3076,3333,3590,3847] ; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movl $2228258, %ecx # imm = 0x220022 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -2187,7 +2187,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, 
%zmm2 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512DQ-BW-NEXT: vpmovsxwd {{.*#+}} zmm5 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] ; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6] @@ -2198,11 +2198,11 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxwq {{.*#+}} zmm8 = [2048,2305,2562,2819,3076,3333,3590,3847] ; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10 ; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 ; AVX512DQ-BW-NEXT: movl $2228258, %ecx # imm = 0x220022 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -2241,23 +2241,23 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm6 = [0,2,0,2,12,14,12,14] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm7, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm2[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxwd {{.*#+}} zmm9 = [0,2048,0,2305,0,2562,0,2819,0,3076,0,3333,0,3590,0,3847] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm8, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $8913032, %ecx # imm = 0x880088 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} zmm10 = [2048,2305,2562,2819,3076,3333,3590,3847] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $2228258, %ecx # imm = 0x220022 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -3264,7 +3264,7 @@ define void 
@store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] @@ -3332,7 +3332,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -3417,7 +3417,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-FCP-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] @@ -3485,7 +3485,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -3788,7 +3788,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm3 = [2312,2826,3340,3854] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] @@ -3799,7 +3799,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; 
AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm9 = [1284,1798] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] @@ -4146,14 +4146,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm4 = [1284,1798] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm10 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero @@ -4306,7 +4306,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm14 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm15 @@ -4328,7 +4328,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA @@ -4418,7 +4418,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm10 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm5 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm12 @@ -4432,7 +4432,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm19 ; 
AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -4494,7 +4494,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm14 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm15 @@ -4516,7 +4516,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = 
[4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA @@ -4606,7 +4606,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm12 @@ -4620,7 +4620,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -6485,7 +6485,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %ymm3, 
%ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm3 = [1284,1798] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm3, %xmm13 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -6528,7 +6528,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -6544,7 +6544,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798] ; 
AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 @@ -6595,7 +6595,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] @@ -6611,7 +6611,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 @@ -6664,7 +6664,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm8 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] @@ -6680,7 +6680,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm5 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 @@ -6809,7 +6809,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq 
{{.*#+}} xmm3 = [1284,1798] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm3, %xmm13 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -6852,7 +6852,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -6868,7 +6868,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 @@ -6919,7 +6919,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] @@ -6935,7 +6935,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 @@ -6988,7 +6988,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm8 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] @@ -7004,7 +7004,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798] ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm5 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 @@ -7729,12 +7729,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; 
AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] @@ -8502,12 +8502,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] @@ -8661,9 +8661,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec 
; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm24 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] ; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] ; AVX512BW-NEXT: vpermw %zmm1, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] ; AVX512BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpermw %zmm0, %zmm9, %zmm4 {%k1} @@ -8688,7 +8688,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm11 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] ; AVX512BW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] ; AVX512BW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vpermw %zmm14, %zmm12, %zmm3 {%k2} @@ -8909,9 +8909,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm28 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] ; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm26 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] ; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm27 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 @@ -9127,9 +9127,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm24 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] ; AVX512DQ-BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm9, %zmm4 {%k1} @@ -9154,7 +9154,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm11 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] ; AVX512DQ-BW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm12, %zmm3 {%k2} @@ -9375,9 +9375,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm28 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm27 ; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 0fe759f3c4310..5c57045fbc226 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -264,7 +264,7 
@@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw ; SSE4: # %bb.0: ; SSE4-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE4-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8] +; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8] ; SSE4-NEXT: pmullw %xmm2, %xmm0 ; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE4-NEXT: pand %xmm3, %xmm0 @@ -881,31 +881,42 @@ define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind { ; define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind { -; X86-SSE-LABEL: mul_v2i64_17_65: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_17_65: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq $32, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: mul_v2i64_17_65: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,65] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; SSE4-LABEL: mul_v2i64_17_65: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [17,65] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: pmuludq %xmm1, %xmm2 +; SSE4-NEXT: psrlq $32, %xmm0 +; SSE4-NEXT: pmuludq %xmm1, 
%xmm0 +; SSE4-NEXT: psllq $32, %xmm0 +; SSE4-NEXT: paddq %xmm2, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: mul_v2i64_17_65: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,65] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlq $32, %xmm0 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_17_65: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,65] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -915,7 +926,7 @@ define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_17_65: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,65] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1295,31 +1306,42 @@ define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind { ; define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { -; X86-SSE-LABEL: mul_v2i64_15_63: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_15_63: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq $32, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: 
mul_v2i64_15_63: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,63] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; SSE4-LABEL: mul_v2i64_15_63: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,63] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: pmuludq %xmm1, %xmm2 +; SSE4-NEXT: psrlq $32, %xmm0 +; SSE4-NEXT: pmuludq %xmm1, %xmm0 +; SSE4-NEXT: psllq $32, %xmm0 +; SSE4-NEXT: paddq %xmm2, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: mul_v2i64_15_63: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,63] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlq $32, %xmm0 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_15_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,63] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1329,7 +1351,7 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_15_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,63] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1346,37 +1368,65 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { -; X86-SSE-LABEL: mul_v2i64_neg_15_63: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; 
X86-SSE-NEXT: psrlq $32, %xmm2 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] -; X86-SSE-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE-NEXT: paddq %xmm1, %xmm2 -; X86-SSE-NEXT: psllq $32, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm3, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_neg_15_63: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrlq $32, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE2-NEXT: paddq %xmm1, %xmm2 +; X86-SSE2-NEXT: psllq $32, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: mul_v2i64_neg_15_63: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: paddq %xmm3, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; X86-SSE4-LABEL: mul_v2i64_neg_15_63: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlq $32, %xmm2 +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553] +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE4-NEXT: paddq %xmm1, %xmm2 +; X86-SSE4-NEXT: psllq $32, %xmm2 +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: retl +; +; X64-SSE2-LABEL: mul_v2i64_neg_15_63: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[18446744073709551601,18446744073709551553] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: psrlq $32, %xmm3 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: paddq %xmm3, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq +; +; X64-SSE4-LABEL: mul_v2i64_neg_15_63: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE4-NEXT: psrlq $32, %xmm3 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE4-NEXT: paddq %xmm3, %xmm0 +; X64-SSE4-NEXT: psllq $32, %xmm0 +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_15_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1388,7 +1438,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_neg_15_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1407,37 +1457,65 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { -; X86-SSE-LABEL: mul_v2i64_neg_17_65: -; X86-SSE: # %bb.0: 
-; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm2 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] -; X86-SSE-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE-NEXT: paddq %xmm1, %xmm2 -; X86-SSE-NEXT: psllq $32, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm3, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_neg_17_65: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrlq $32, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE2-NEXT: paddq %xmm1, %xmm2 +; X86-SSE2-NEXT: psllq $32, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: mul_v2i64_neg_17_65: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: paddq %xmm3, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; X86-SSE4-LABEL: mul_v2i64_neg_17_65: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: psrlq $32, %xmm2 +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551] +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE4-NEXT: paddq %xmm1, %xmm2 +; X86-SSE4-NEXT: psllq $32, %xmm2 +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: retl +; +; X64-SSE2-LABEL: 
mul_v2i64_neg_17_65: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: psrlq $32, %xmm3 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: paddq %xmm3, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq +; +; X64-SSE4-LABEL: mul_v2i64_neg_17_65: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE4-NEXT: psrlq $32, %xmm3 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE4-NEXT: paddq %xmm3, %xmm0 +; X64-SSE4-NEXT: psllq $32, %xmm0 +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_17_65: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1449,7 +1527,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_neg_17_65: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1497,37 +1575,65 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> 
%a0) nounwind { -; X86-SSE-LABEL: mul_v2i64_neg_0_1: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: paddq %xmm3, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_neg_0_1: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlq $32, %xmm3 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: paddq %xmm3, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: mul_v2i64_neg_0_1: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: paddq %xmm3, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; X86-SSE4-LABEL: mul_v2i64_neg_0_1: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: psrlq $32, %xmm3 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: paddq %xmm3, %xmm0 +; X86-SSE4-NEXT: psllq $32, %xmm0 +; 
X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: retl +; +; X64-SSE2-LABEL: mul_v2i64_neg_0_1: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: psrlq $32, %xmm3 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: paddq %xmm3, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq +; +; X64-SSE4-LABEL: mul_v2i64_neg_0_1: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE4-NEXT: psrlq $32, %xmm3 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE4-NEXT: paddq %xmm3, %xmm0 +; X64-SSE4-NEXT: psllq $32, %xmm0 +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_0_1: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1539,7 +1645,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_neg_0_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1558,37 +1664,65 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { } define <2 x 
i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { -; X86-SSE-LABEL: mul_v2i64_15_neg_63: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: paddq %xmm3, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_15_neg_63: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psrlq $32, %xmm3 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: paddq %xmm3, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: mul_v2i64_15_neg_63: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: paddq %xmm3, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; X86-SSE4-LABEL: mul_v2i64_15_neg_63: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] +; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE4-NEXT: psrlq $32, %xmm3 +; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE4-NEXT: paddq %xmm3, %xmm0 +; X86-SSE4-NEXT: 
psllq $32, %xmm0 +; X86-SSE4-NEXT: paddq %xmm2, %xmm0 +; X86-SSE4-NEXT: retl +; +; X64-SSE2-LABEL: mul_v2i64_15_neg_63: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: psrlq $32, %xmm3 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: paddq %xmm3, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq +; +; X64-SSE4-LABEL: mul_v2i64_15_neg_63: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] +; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE4-NEXT: psrlq $32, %xmm3 +; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE4-NEXT: paddq %xmm3, %xmm0 +; X64-SSE4-NEXT: psllq $32, %xmm0 +; X64-SSE4-NEXT: paddq %xmm2, %xmm0 +; X64-SSE4-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_15_neg_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1600,7 +1734,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_15_neg_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1695,7 +1829,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> ; SSE4: # %bb.0: ; SSE4-NEXT: 
pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE4-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] +; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] ; SSE4-NEXT: pmullw %xmm2, %xmm0 ; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE4-NEXT: pand %xmm3, %xmm0 @@ -1708,7 +1842,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> ; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: ; X64-XOP: # %bb.0: ; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] +; X64-XOP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] ; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0 @@ -1738,31 +1872,42 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> } define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { -; X86-SSE-LABEL: mul_v2i64_68_132: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_68_132: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq $32, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; 
-; X64-SSE-LABEL: mul_v2i64_68_132: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,132] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; SSE4-LABEL: mul_v2i64_68_132: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxwq {{.*#+}} xmm1 = [68,132] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: pmuludq %xmm1, %xmm2 +; SSE4-NEXT: psrlq $32, %xmm0 +; SSE4-NEXT: pmuludq %xmm1, %xmm0 +; SSE4-NEXT: psllq $32, %xmm0 +; SSE4-NEXT: paddq %xmm2, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: mul_v2i64_68_132: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [68,132] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlq $32, %xmm0 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_68_132: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132] +; X64-XOP-NEXT: vpmovsxwq {{.*#+}} xmm1 = [68,132] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1772,7 +1917,7 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_68_132: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132] +; X64-AVX2-NEXT: vpmovsxwq {{.*#+}} xmm1 = [68,132] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1789,31 +1934,42 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { } define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind { -; X86-SSE-LABEL: mul_v2i64_60_120: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; 
X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: mul_v2i64_60_120: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq $32, %xmm0 +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: psllq $32, %xmm0 +; X86-SSE2-NEXT: paddq %xmm2, %xmm0 +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: mul_v2i64_60_120: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,124] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: retq +; SSE4-LABEL: mul_v2i64_60_120: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [60,124] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: pmuludq %xmm1, %xmm2 +; SSE4-NEXT: psrlq $32, %xmm0 +; SSE4-NEXT: pmuludq %xmm1, %xmm0 +; SSE4-NEXT: psllq $32, %xmm0 +; SSE4-NEXT: paddq %xmm2, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: mul_v2i64_60_120: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [60,124] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlq $32, %xmm0 +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: psllq $32, %xmm0 +; X64-SSE2-NEXT: paddq %xmm2, %xmm0 +; X64-SSE2-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_60_120: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124] +; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,124] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1823,7 +1979,7 @@ define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_60_120: ; 
X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124] +; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,124] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll index 0efe80dc94dad..aeab2a1931c2f 100644 --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -9,9 +9,9 @@ define <32 x i16> @trunc_concat_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $17, %zmm0, %zmm0 ; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovdw %zmm3, %ymm0 ; AVX512-NEXT: vpmovdw %zmm2, %ymm1 @@ -29,9 +29,9 @@ define <32 x i16> @trunc_concat_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrld $17, %zmm0, %zmm0 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovdw %zmm3, %ymm0 ; AVX512-NEXT: vpmovdw %zmm2, %ymm1 @@ -52,9 +52,9 @@ define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, 
%zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 @@ -75,9 +75,9 @@ define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm1 @@ -98,9 +98,9 @@ define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 @@ -121,9 +121,9 @@ define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm1 @@ -147,7 +147,7 @@ define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %1 = ashr <16 x i32> %a0, @@ -167,7 +167,7 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %1 = lshr <16 x i32> %a0, @@ -194,7 +194,7 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, 
%ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -206,7 +206,7 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: retq %1 = ashr <32 x i16> %a0, @@ -233,7 +233,7 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -245,7 +245,7 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: retq %1 = lshr <32 x i16> %a0, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll index 364dc185d26c2..8aafec7427b4f 100644 --- 
a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -1856,7 +1856,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2174,7 +2174,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2492,7 +2492,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2810,7 +2810,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3128,7 +3128,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3446,7 +3446,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8,8,8,8,8] +; SSE41-NEXT: 
pmovsxbw {{.*#+}} xmm0 = [8,8,8,8,8,8,8,8] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3764,7 +3764,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9,9,9,9,9,9,9,9] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [9,9,9,9,9,9,9,9] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4082,7 +4082,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10,10,10,10,10] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [10,10,10,10,10,10,10,10] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4400,7 +4400,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [11,11,11,11,11,11,11,11] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [11,11,11,11,11,11,11,11] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4718,7 +4718,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12,12,12,12,12] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [12,12,12,12,12,12,12,12] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -5036,7 +5036,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [13,13,13,13,13,13,13,13] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [13,13,13,13,13,13,13,13] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -5354,7 +5354,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: 
movdqa {{.*#+}} xmm0 = [14,14,14,14,14,14,14,14] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [14,14,14,14,14,14,14,14] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -5672,7 +5672,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -6174,7 +6174,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [3,3,3,3] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -6548,7 +6548,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [4,4,4,4] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -6922,7 +6922,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [5,5,5,5] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -7296,7 +7296,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [6,6,6,6] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -7670,7 +7670,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; 
SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [7,7,7,7] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -8044,7 +8044,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [8,8,8,8] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -8418,7 +8418,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9,9,9,9] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [9,9,9,9] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -8792,7 +8792,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [10,10,10,10] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -9166,7 +9166,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [11,11,11,11] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [11,11,11,11] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -9540,7 +9540,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [12,12,12,12] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -9914,7 +9914,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: 
movdqa {{.*#+}} xmm0 = [13,13,13,13] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [13,13,13,13] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -10288,7 +10288,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [14,14,14,14] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [14,14,14,14] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -10662,7 +10662,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [15,15,15,15] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -11036,7 +11036,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [16,16,16,16] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -11410,7 +11410,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [17,17,17,17] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [17,17,17,17] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -11784,7 +11784,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18,18,18,18] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [18,18,18,18] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -12158,7 +12158,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb 
%xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,19,19,19] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [19,19,19,19] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -12532,7 +12532,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [20,20,20,20] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [20,20,20,20] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -12906,7 +12906,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [21,21,21,21] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [21,21,21,21] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -13280,7 +13280,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [22,22,22,22] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [22,22,22,22] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -13654,7 +13654,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [23,23,23,23] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [23,23,23,23] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -14028,7 +14028,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [24,24,24,24] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [24,24,24,24] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -14402,7 +14402,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 
; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [25,25,25,25] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -14776,7 +14776,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [26,26,26,26] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [26,26,26,26] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -15150,7 +15150,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [27,27,27,27] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [27,27,27,27] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -15524,7 +15524,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [28,28,28,28] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [28,28,28,28] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -15898,7 +15898,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [29,29,29,29] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [29,29,29,29] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -16272,7 +16272,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [30,30,30,30] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [30,30,30,30] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -16646,7 +16646,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; 
SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [31,31,31,31] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [31,31,31,31] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -16917,7 +16917,7 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17207,8 +17207,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17224,7 +17223,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17232,7 +17231,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17250,7 +17249,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17524,8 +17523,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17541,7 +17539,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17549,7 +17547,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17567,7 +17565,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17841,8 +17839,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -17858,7 +17855,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17866,7 +17863,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17884,7 +17881,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18158,8 +18155,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -18175,7 +18171,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: 
vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18183,7 +18179,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18201,7 +18197,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18475,8 +18471,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -18492,7 +18487,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18500,7 +18495,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18518,7 +18513,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18792,8 +18787,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -18809,7 +18803,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18817,7 +18811,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18835,7 +18829,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19109,8 +19103,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -19126,7 +19119,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19134,7 +19127,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19152,7 +19145,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19426,8 +19419,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -19443,7 +19435,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19451,7 +19443,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19469,7 +19461,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19743,8 +19735,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -19760,7 +19751,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19768,7 +19759,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19786,7 +19777,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20060,8 +20051,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -20077,7 +20067,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20085,7 +20075,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20103,7 +20093,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20377,8 +20367,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -20394,7 +20383,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20402,7 +20391,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20420,7 +20409,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb 
%zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20694,8 +20683,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -20711,7 +20699,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20719,7 +20707,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20737,7 +20725,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21011,8 +20999,7 @@ define <2 x i64> 
@ult_15_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21028,7 +21015,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21036,7 +21023,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21054,7 +21041,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21328,8 +21315,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21345,7 +21331,7 @@ define <2 x i64> 
@ult_16_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21353,7 +21339,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21371,7 +21357,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21645,8 +21631,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21662,7 +21647,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21670,7 +21655,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; 
AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21688,7 +21673,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21962,8 +21947,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -21979,7 +21963,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21987,7 +21971,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22005,7 
+21989,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22279,8 +22263,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -22296,7 +22279,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22304,7 +22287,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22322,7 +22305,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: 
vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22596,8 +22579,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -22613,7 +22595,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22621,7 +22603,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22639,7 +22621,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22913,8 +22895,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; AVX1-NEXT: vpcmpgtq %xmm0, 
%xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -22930,7 +22911,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22938,7 +22919,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22956,7 +22937,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23230,8 +23211,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23247,7 +23227,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; 
@@ -23255,7 +23235,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23273,7 +23253,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23547,8 +23527,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23564,7 +23543,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23572,7 +23551,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; 
AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23590,7 +23569,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23864,8 +23843,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -23881,7 +23859,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23889,7 +23867,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23907,7 +23885,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] 
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24181,8 +24159,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -24198,7 +24175,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24206,7 +24183,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24224,7 +24201,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24498,8 +24475,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] -; AVX1-NEXT: # xmm1 = mem[0,0] +; 
AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -24515,7 +24491,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24523,7 +24499,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24541,7 +24517,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24815,8 +24791,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -24832,7 +24807,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 
= [27,27] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24840,7 +24815,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24858,7 +24833,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25132,8 +25107,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -25149,7 +25123,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25157,7 +25131,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq 
{{.*#+}} xmm1 = [28,28] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25175,7 +25149,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25449,8 +25423,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -25466,7 +25439,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25474,7 +25447,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25492,7 +25465,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq 
{{.*#+}} xmm1 = [29,29] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25766,8 +25739,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -25783,7 +25755,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25791,7 +25763,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25809,7 +25781,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26083,8 +26055,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -26100,7 +26071,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26108,7 +26079,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26126,7 +26097,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26400,8 +26371,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -26417,7 +26387,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26425,7 +26395,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26443,7 +26413,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26717,8 +26687,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -26734,7 +26703,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26742,7 +26711,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26760,7 +26729,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27034,8 +27003,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -27051,7 +27019,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27059,7 +27027,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27077,7 +27045,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27351,8 +27319,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -27368,7 +27335,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27376,7 +27343,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27394,7 +27361,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27668,8 +27635,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, 
%xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -27685,7 +27651,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27693,7 +27659,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27711,7 +27677,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27985,8 +27951,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28002,7 +27967,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, 
%xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28010,7 +27975,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28028,7 +27993,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28302,8 +28267,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28319,7 +28283,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28327,7 +28291,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def 
$xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28345,7 +28309,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28619,8 +28583,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28636,7 +28599,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28644,7 +28607,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28662,7 +28625,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; 
BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28936,8 +28899,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -28953,7 +28915,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28961,7 +28923,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28979,7 +28941,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29253,8 +29215,7 @@ 
define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29270,7 +29231,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29278,7 +29239,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29296,7 +29257,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29570,8 +29531,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29587,7 +29547,7 @@ define 
<2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29595,7 +29555,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29613,7 +29573,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29887,8 +29847,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -29904,7 +29863,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29912,7 +29871,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> 
%0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29930,7 +29889,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30204,8 +30163,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -30221,7 +30179,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30229,7 +30187,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ 
-30247,7 +30205,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30521,8 +30479,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -30538,7 +30495,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30546,7 +30503,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30564,7 +30521,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; 
BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30838,8 +30795,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -30855,7 +30811,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30863,7 +30819,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30881,7 +30837,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31155,8 +31111,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; AVX1-NEXT: 
vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -31172,7 +31127,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31180,7 +31135,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31198,7 +31153,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31472,8 +31427,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -31489,7 +31443,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; 
AVX2-NEXT: retq ; @@ -31497,7 +31451,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31515,7 +31469,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31789,8 +31743,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -31806,7 +31759,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31814,7 +31767,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq 
%xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31832,7 +31785,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32106,8 +32059,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -32123,7 +32075,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32131,7 +32083,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32149,7 +32101,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; BITALG_NOVLX-NEXT: vpmovsxbq 
{{.*#+}} xmm1 = [50,50] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32423,8 +32375,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -32440,7 +32391,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32448,7 +32399,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32466,7 +32417,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32740,8 +32691,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] -; AVX1-NEXT: # 
xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -32757,7 +32707,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32765,7 +32715,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32783,7 +32733,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33057,8 +33007,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -33074,7 +33023,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; AVX2-NEXT: 
vpmovsxbq {{.*#+}} xmm1 = [53,53] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33082,7 +33031,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33100,7 +33049,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33374,8 +33323,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -33391,7 +33339,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33399,7 +33347,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; 
AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33417,7 +33365,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33691,8 +33639,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -33708,7 +33655,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33716,7 +33663,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33734,7 +33681,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34008,8 +33955,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34025,7 +33971,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34033,7 +33979,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34051,7 +33997,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34325,8 +34271,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: 
vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34342,7 +34287,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34350,7 +34295,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34368,7 +34313,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34642,8 +34587,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34659,7 +34603,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: 
vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34667,7 +34611,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34685,7 +34629,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34959,8 +34903,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -34976,7 +34919,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34984,7 +34927,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq 
%zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35002,7 +34945,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35276,8 +35219,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35293,7 +35235,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35301,7 +35243,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35319,7 +35261,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35593,8 +35535,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35610,7 +35551,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35618,7 +35559,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35636,7 +35577,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35910,8 +35851,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX1-NEXT: 
vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -35927,7 +35867,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35935,7 +35875,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35953,7 +35893,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -36227,8 +36167,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -36244,7 +36183,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb 
%xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -36252,7 +36191,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -36270,7 +36209,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index c1a248fadd9c7..05854ff728a07 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -9554,8 +9554,7 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9638,8 +9637,7 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9722,8 +9720,7 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9806,8 +9803,7 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9890,8 +9886,7 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9974,8 +9969,7 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, 
%xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10058,8 +10052,7 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10142,8 +10135,7 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10226,8 +10218,7 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10310,8 +10301,7 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10394,8 +10384,7 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10478,8 +10467,7 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10562,8 +10550,7 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10646,8 +10633,7 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10730,8 +10716,7 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10814,8 +10799,7 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10898,8 +10882,7 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10982,8 +10965,7 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11066,8 +11048,7 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11150,8 +11131,7 @@ define <4 x i64> 
@ult_12_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11234,8 +11214,7 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11318,8 +11297,7 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11402,8 +11380,7 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11486,8 +11463,7 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} 
xmm1 = [14,14] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11570,8 +11546,7 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11654,8 +11629,7 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11738,8 +11712,7 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11822,8 +11795,7 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11906,8 +11878,7 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11990,8 +11961,7 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12074,8 +12044,7 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12158,8 +12127,7 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12242,8 +12210,7 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12326,8 +12293,7 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12410,8 +12376,7 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12494,8 +12459,7 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12578,8 +12542,7 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12662,8 +12625,7 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12746,8 +12708,7 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12830,8 +12791,7 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12914,8 +12874,7 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12998,8 +12957,7 @@ define <4 x i64> 
@ult_23_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13082,8 +13040,7 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13166,8 +13123,7 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13250,8 +13206,7 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13334,8 +13289,7 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} 
xmm1 = [25,25] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13418,8 +13372,7 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13502,8 +13455,7 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13586,8 +13538,7 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13670,8 +13621,7 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13754,8 +13704,7 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13838,8 +13787,7 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13922,8 +13870,7 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14006,8 +13953,7 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14090,8 +14036,7 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14174,8 +14119,7 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14258,8 +14202,7 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14342,8 +14285,7 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14426,8 +14368,7 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14510,8 +14451,7 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14594,8 +14534,7 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14678,8 +14617,7 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14762,8 +14700,7 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14846,8 +14783,7 @@ define <4 x i64> 
@ult_34_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14930,8 +14866,7 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15014,8 +14949,7 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15098,8 +15032,7 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15182,8 +15115,7 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} 
xmm1 = [36,36] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15266,8 +15198,7 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15350,8 +15281,7 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15434,8 +15364,7 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15518,8 +15447,7 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15602,8 +15530,7 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15686,8 +15613,7 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15770,8 +15696,7 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15854,8 +15779,7 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15938,8 +15862,7 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16022,8 +15945,7 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16106,8 +16028,7 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16190,8 +16111,7 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16274,8 +16194,7 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16358,8 +16277,7 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16442,8 +16360,7 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16526,8 +16443,7 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16610,8 +16526,7 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16694,8 +16609,7 @@ define <4 x i64> 
@ult_45_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16778,8 +16692,7 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16862,8 +16775,7 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16946,8 +16858,7 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17030,8 +16941,7 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} 
xmm1 = [47,47] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17114,8 +17024,7 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17198,8 +17107,7 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17282,8 +17190,7 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17366,8 +17273,7 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17450,8 +17356,7 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17534,8 +17439,7 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17618,8 +17522,7 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17702,8 +17605,7 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17786,8 +17688,7 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17870,8 +17771,7 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17954,8 +17854,7 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18038,8 +17937,7 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18122,8 +18020,7 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18206,8 +18103,7 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18290,8 +18186,7 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18374,8 +18269,7 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18458,8 +18352,7 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18542,8 +18435,7 @@ define <4 x i64> 
@ult_56_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18626,8 +18518,7 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18710,8 +18601,7 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18794,8 +18684,7 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18878,8 +18767,7 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} 
xmm1 = [58,58] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18962,8 +18850,7 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19046,8 +18933,7 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19130,8 +19016,7 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19214,8 +19099,7 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19298,8 +19182,7 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19382,8 +19265,7 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19466,8 +19348,7 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19550,8 +19431,7 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19634,8 +19514,7 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, 
%xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19718,8 +19597,7 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 8056b9a2963c3..1c204333a0335 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -232,7 +232,7 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) { ; ; SSE41-LABEL: test_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [1,1] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm8 = [1,1] ; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: paddq %xmm5, %xmm1 @@ -560,7 +560,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; ; SSE41-LABEL: test_v16i32_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm4 = [255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: paddd %xmm2, %xmm0 @@ -669,7 +669,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) { ; ; SSE41-LABEL: test_v32i32_v32i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm8 = [255,255,255,255] ; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: 
pand %xmm8, %xmm1 ; SSE41-NEXT: paddd %xmm5, %xmm1 @@ -1193,7 +1193,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) { ; ; SSE41-LABEL: test_v64i16_v64i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: packuswb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll index 401118af26259..aced5e0290b0d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll @@ -1121,8 +1121,7 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX512VL-NEXT: vpsadbw %ymm0, %ymm6, %ymm6 ; AVX512VL-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsadbw %ymm0, %ymm4, %ymm4 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12] -; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,8,12] ; AVX512VL-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 @@ -1165,8 +1164,7 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX512VPOPCNT-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 ; AVX512VPOPCNT-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512VPOPCNT-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12] -; AVX512VPOPCNT-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,4,8,12] ; AVX512VPOPCNT-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 ; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512VPOPCNT-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 
300e0ae81e08c..3d33a51a2821c 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -658,7 +658,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; ; SSE41-LABEL: trunc_v16i32_v16i1: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm4 = [255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a3d3ed6a4b243..671ef61fd3c9c 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -98,7 +98,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -110,7 +110,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -122,7 +122,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd 
$255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -142,10 +142,10 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -159,10 +159,10 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -176,7 +176,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr 
%in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -200,10 +200,10 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -227,10 +227,10 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd 
%zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -306,11 +306,11 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} @@ -355,11 +355,11 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 @@ -481,7 +481,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; 
AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: movb $63, %al @@ -498,7 +498,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movb $63, %al @@ -516,7 +516,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0] ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512BW-NEXT: movb $63, %al @@ -542,7 +542,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -558,7 +558,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, 
%zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -574,7 +574,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF @@ -599,7 +599,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -609,7 +609,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -623,7 +623,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr 
%out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 @@ -632,7 +632,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -646,7 +646,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF @@ -673,7 +673,7 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -681,10 +681,10 @@ define void 
@mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -700,7 +700,7 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -708,10 +708,10 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: 
vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -727,15 +727,15 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -758,7 +758,7 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} @@ -766,10 +766,10 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} ; 
AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -799,7 +799,7 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k1 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 @@ -807,10 +807,10 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -1308,7 +1308,7 @@ define void @mask_replication_factor3_vf64(ptr 
%in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} @@ -1322,9 +1322,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9 @@ -1377,7 +1377,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -1391,9 +1391,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9 @@ -2350,7 +2350,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2374,7 +2374,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2400,7 +2400,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-FAST-NEXT: vptestmd 
%ymm0, %ymm0, %k1 ; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2426,7 +2426,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VBMI-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2447,7 +2447,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2459,7 +2459,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2471,7 +2471,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} 
zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2491,10 +2491,10 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -2508,10 +2508,10 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -2525,7 +2525,7 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2548,16 +2548,16 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -2575,16 +2575,16 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -2653,17 +2653,17 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} @@ -2700,17 +2700,17 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -2830,13 +2830,13 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; 
AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11 @@ -2911,13 +2911,13 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11 @@ -3121,7 +3121,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3138,7 +3138,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3155,7 +3155,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3186,7 +3186,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: movw $15, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -3205,7 +3205,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: movw $15, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -3219,7 +3219,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF @@ -3246,7 +3246,7 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -3254,12 +3254,12 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, 
ptr %out ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -3275,7 +3275,7 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 @@ -3283,11 +3283,11 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 
{%k1} {z} @@ -3354,7 +3354,7 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -3362,16 +3362,16 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -3391,7 +3391,7 @@ define 
void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -3399,16 +3399,16 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -3428,23 +3428,23 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -3468,7 +3468,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; 
AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} @@ -3478,13 +3478,13 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -3527,7 +3527,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -3537,13 +3537,13 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: 
vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -4307,7 +4307,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -4321,13 +4321,13 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; 
AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13 @@ -4410,7 +4410,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -4424,13 +4424,13 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 
= [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13 @@ -5857,7 +5857,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5873,7 +5873,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5889,7 +5889,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF @@ 
-5920,7 +5920,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-SLOW-NEXT: movw $255, %ax ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5934,13 +5934,13 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: movw $255, %ax ; AVX512F-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5960,7 +5960,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW-NEXT: movw $255, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ 
-5974,13 +5974,13 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: movw $255, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5994,7 +5994,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF @@ -6021,7 +6021,7 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 
{%k1} {z} @@ -6029,10 +6029,10 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -6048,7 +6048,7 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -6056,10 +6056,10 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; 
AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -6075,15 +6075,15 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -6105,7 +6105,7 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -6113,19 +6113,19 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -6147,7 +6147,7 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -6155,19 +6155,19 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, 
%k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -6189,27 +6189,27 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -6234,7 +6234,7 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd 
%zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} @@ -6244,15 +6244,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -6302,7 +6302,7 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: 
vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -6312,15 +6312,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -7225,7 +7225,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} 
@@ -7239,15 +7239,15 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 @@ -7345,7 +7345,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -7359,15 +7359,15 @@ 
define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 @@ -9071,7 +9071,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9089,7 +9089,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr 
%in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9107,7 +9107,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9134,13 +9134,13 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -9155,13 +9155,13 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; 
AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -9176,7 +9176,7 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $268435455, %eax # imm = 0xFFFFFFF @@ -9204,7 +9204,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 ; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -9212,10 +9212,10 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-SLOW-NEXT: kmovw %eax, %k2 ; 
AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -9238,7 +9238,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-FAST-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -9246,15 +9246,15 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST-NEXT: kmovw %eax, %k2 ; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4 ; 
AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -9272,7 +9272,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1 @@ -9280,10 +9280,10 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 @@ -9305,7 +9305,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; 
AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1 @@ -9313,14 +9313,14 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -9395,7 +9395,7 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -9403,22 +9403,22 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; 
AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -9442,7 +9442,7 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -9450,22 +9450,22 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 ; 
AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -9489,31 +9489,31 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -9539,7 +9539,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} @@ -9549,17 +9549,17 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -9616,7 +9616,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -9626,17 +9626,17 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -10677,7 +10677,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -10691,17 +10691,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = 
[11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 @@ -10814,7 +10814,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -10828,17 +10828,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 @@ -12789,7 +12789,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12801,7 +12801,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d 
%k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12813,7 +12813,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12833,10 +12833,10 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -12850,10 +12850,10 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; 
AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -12867,7 +12867,7 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12890,16 +12890,16 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -12917,16 +12917,16 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -12994,29 +12994,29 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, 
%k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 
64(%rsi), %zmm0 {%k1} {z} @@ -13043,29 +13043,29 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, 
%zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -13135,21 +13135,21 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 
{{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2 @@ -13216,21 +13216,21 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; 
AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2 @@ -13373,25 +13373,25 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13 @@ -13532,25 +13532,25 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = 
[10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index ad810c092bf55..3b43003b36da6 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -40,7 +40,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; SSE41-LABEL: var_rotate_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64] ; SSE41-NEXT: psubq %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllq %xmm1, %xmm3 @@ -58,8 +58,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, 
<2 x i64> %b) nounwind { ; ; AVX1-LABEL: var_rotate_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -74,7 +73,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 @@ -661,34 +660,34 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { -; SSE-LABEL: splatvar_rotate_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,64] -; SSE-NEXT: psubq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psllq %xmm1, %xmm3 -; SSE-NEXT: psrlq %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: splatvar_rotate_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllq %xmm1, %xmm3 +; SSE2-NEXT: psrlq %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: retq ; -; AVX1-LABEL: splatvar_rotate_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq +; SSE41-LABEL: splatvar_rotate_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64] +; SSE41-NEXT: psubq %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllq %xmm1, %xmm3 +; SSE41-NEXT: psrlq %xmm2, %xmm0 +; 
SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: retq ; -; AVX2-LABEL: splatvar_rotate_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64] -; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: splatvar_rotate_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] +; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; ; AVX512NOVLX-LABEL: splatvar_rotate_v2i64: ; AVX512NOVLX: # %bb.0: @@ -815,7 +814,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_rotate_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -828,7 +827,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX-LABEL: splatvar_rotate_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -839,7 +838,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512F-LABEL: splatvar_rotate_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -850,7 +849,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VL-LABEL: splatvar_rotate_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] 
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -861,7 +860,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512BW-LABEL: splatvar_rotate_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -872,7 +871,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VLBW-LABEL: splatvar_rotate_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1054,7 +1053,7 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v2i64: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] ; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper @@ -1138,7 +1137,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v4i32: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512NOVLX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] ; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper @@ -1210,8 +1209,8 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) 
nounwind { ; AVX512BW-LABEL: constant_rotate_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 @@ -1228,7 +1227,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; AVX512VBMI2-LABEL: constant_rotate_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1306,11 +1305,11 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_rotate_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1331,11 +1330,11 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; ; AVX512VBMI2-LABEL: constant_rotate_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index dae2cf382b820..dcf67f52d6422 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -17,8 +17,7 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: var_rotate_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 @@ -518,8 +517,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4 @@ -534,7 +532,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-LABEL: splatvar_rotate_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [64,64] +; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 
= [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -649,7 +647,7 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 @@ -866,7 +864,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v4i64: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] ; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq @@ -930,7 +928,7 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v8i32: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq @@ -1003,8 +1001,8 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_rotate_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -1020,7 +1018,7 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; AVX512VBMI2-LABEL: constant_rotate_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 4709525e71c27..967069d978927 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2337,19 +2337,47 @@ entry: } define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { -; SSE-LABEL: load_sext_8i1_to_8i32: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movzbl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: load_sext_8i1_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
load_sext_8i1_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i1_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movzbl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,2,4,8] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [16,32,64,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: load_sext_8i1_to_8i32: ; AVX1: # %bb.0: # %entry @@ -2368,7 +2396,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { ; AVX2-LABEL: load_sext_8i1_to_8i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2389,20 +2417,35 @@ define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X86-SSE-LABEL: load_sext_8i1_to_8i32: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movzbl (%eax), %eax -; X86-SSE-NEXT: movd %eax, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa 
{{.*#+}} xmm2 = [16,32,64,128] -; X86-SSE-NEXT: pand %xmm2, %xmm1 -; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm1 -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: load_sext_8i1_to_8i32: +; X86-SSE2: # %bb.0: # %entry +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzbl (%eax), %eax +; X86-SSE2-NEXT: movd %eax, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: load_sext_8i1_to_8i32: +; X86-SSE41: # %bb.0: # %entry +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movzbl (%eax), %eax +; X86-SSE41-NEXT: movd %eax, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X86-SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,2,4,8] +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE41-NEXT: pand %xmm2, %xmm0 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [16,32,64,128] +; X86-SSE41-NEXT: pand %xmm2, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; X86-SSE41-NEXT: retl entry: %X = load <8 x i1>, ptr %ptr %Y = sext <8 x i1> %X to <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index cdc2bbe56aee4..53b6aca3e9fcb 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1292,7 +1292,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; AVX512-LABEL: constant_shift_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,7] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 
; AVX512-NEXT: vzeroupper @@ -1452,7 +1452,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1545,7 +1545,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 6d19a81d9fd78..9a483c345f92c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1385,7 +1385,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,0] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 @@ -1424,7 +1424,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512-LABEL: constant_shift_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,31,62] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 
killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq @@ -1440,7 +1440,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0] +; X86-AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,0] ; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 @@ -1583,7 +1583,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1790,8 +1790,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX1-LABEL: splatconstant_shift_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609] -; XOPAVX1-NEXT: # xmm2 = mem[0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609] ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2068,8 +2067,7 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind { ; XOPAVX1-LABEL: shift32_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584] -; XOPAVX1-NEXT: # xmm2 = mem[0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584] ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; 
XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 7ea94678e0b8e..a2fe36e72f6b9 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -342,7 +342,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index ff41d883380a8..36a6226f8f4b9 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1977,7 +1977,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2065,7 +2065,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2153,7 +2153,7 @@ define <2 x i8> 
@constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 43da0f15abaeb..ca8343cd4812c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1198,7 +1198,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1290,7 +1290,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 5fe661c7e7778..e65f78e49dc8d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1309,7 +1309,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index 71719e03c7c6d..74ba1d04161f8 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1671,7 +1671,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1767,7 +1767,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1863,7 +1863,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 12f971fb83b56..22d9d0c33cd21 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -1066,7 +1066,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1154,7 +1154,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 76944994c87d1..389b9b56efc99 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1202,7 +1202,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index 19645fd08c946..d545cb77cba2e 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1478,7 +1478,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1567,7 +1567,7 @@ define <4 x i8> 
@constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1656,7 +1656,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index aca74b63fcb53..468fec66c028b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -370,7 +370,7 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0124: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,4] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -418,7 +418,7 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: 
shuffle_v4i32_0142: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,4,2] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -469,7 +469,7 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0412: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4,1,2] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -511,7 +511,7 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_4012: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,1,2] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -545,7 +545,7 @@ define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0451: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4,5,1] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -580,7 +580,7 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_4015: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,1,5] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -1549,7 +1549,7 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_2456: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa 
{{.*#+}} xmm2 = [6,0,1,2] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,0,1,2] ; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index cd68a3093bb16..d95bf2a48c3cd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1155,7 +1155,7 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10] +; AVX512VL-FAST-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10] ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-NEXT: retq ; @@ -1248,7 +1248,7 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] +; AVX512VL-FAST-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-NEXT: retq ; @@ -1314,7 +1314,7 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7] +; AVX512VL-FAST-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7] ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-NEXT: retq ; @@ -1363,7 +1363,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: shuffle_v8i16_032dXXXX: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1] ; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; @@ -1544,7 +1544,7 @@ define <8 x i16> 
@shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: shuffle_v8i16_012dcde3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3] ; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; @@ -1647,7 +1647,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9] +; AVX512VL-FAST-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9] ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-NEXT: retq ; @@ -1698,7 +1698,7 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: shuffle_v8i16_XX4X8acX: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10] ; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index bac63f2ddb505..9a6d8c3366d98 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -57,7 +57,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -69,7 +69,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; 
AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -112,7 +112,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -124,7 +124,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -167,7 +167,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -179,7 +179,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,3] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, 
%ymm0 ; AVX512VL-NEXT: retq ; @@ -219,7 +219,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -257,7 +257,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,5,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -295,7 +295,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -333,7 +333,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,7,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -374,7 +374,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,8] +; AVX512VL-NEXT: vpmovsxbw 
{{.*#+}} xmm1 = [0,0,0,0,0,0,0,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -416,7 +416,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,9,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,9] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -457,7 +457,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,10,0,0] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,10,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -498,7 +498,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -662,7 +662,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,0] ; AVX512VL-NEXT: vpermw 
%ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1434,7 +1434,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1 ; ; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1452,7 +1452,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] +; XOPAVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1470,7 +1470,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3 ; ; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1488,7 +1488,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3 ; ; XOPAVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] +; XOPAVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1553,7 +1553,7 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1596,7 +1596,7 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1649,7 +1649,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1 ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1696,7 +1696,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: 
vmovdqa {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2082,7 +2082,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2124,7 +2124,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2 ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2586,7 +2586,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2635,7 +2635,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2 ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2686,7 +2686,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2 ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2737,7 +2737,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2778,7 +2778,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3092,7 +3092,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z ; ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3261,7 +3261,7 @@ 
define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_1 ; ; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3297,7 +3297,7 @@ define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_2 ; ; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3336,7 +3336,7 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,2,3,2,11,8,9,8,9,10,11,10,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,0,1,2,3,2,11,8,9,8,9,10,11,10,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3375,7 +3375,7 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,4,5,2,3,0,9,14,15,12,13,10,11,8,9] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [6,7,4,5,2,3,0,9,14,15,12,13,10,11,8,9] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3419,7 +3419,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2 ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} 
ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3464,7 +3464,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3510,7 +3510,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -3525,7 +3525,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,4,4,4,12,8,8,8,8,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,4,4,4,12,8,8,8,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3569,7 +3569,7 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,0,u,1,u,2,u,11,u,8,u,9,u,10,u,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = 
[0,0,0,1,0,2,0,11,0,8,0,9,0,10,0,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3609,7 +3609,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,4,u,5,u,6,u,15,u,12,u,13,u,14,u,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,0,5,0,6,0,15,0,12,0,13,0,14,0,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3658,7 +3658,7 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,7,4,13,11,9,10,8,14,15,12,13] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,1,2,0,6,7,4,13,11,9,10,8,14,15,12,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3703,7 +3703,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,4,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,4,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -3718,7 +3718,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,0,0,0,8,12,12,12,12,8,8,8,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,4,4,4,0,0,0,8,12,12,12,12,8,8,8,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3759,7 
+3759,7 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,13,10,11,8,9,14,15,12,13] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,1,6,7,4,13,10,11,8,9,14,15,12,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3809,7 +3809,7 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,2,6,7,4,13,10,11,8,10,14,15,12,13] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,2,6,7,4,13,10,11,8,10,14,15,12,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3853,7 +3853,7 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,15,10,11,8,9,14,15,12,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,1,6,7,4,15,10,11,8,9,14,15,12,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3897,7 +3897,7 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0 ; ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,5,6,4,3,1,2,8,15,13,14,12,11,9,10,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [7,5,6,4,3,1,2,8,15,13,14,12,11,9,10,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3940,7 +3940,7 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX2-FAST-ALL: # %bb.0: -; 
AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -3955,7 +3955,7 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,5,4,5,4,1,8,9,8,13,12,13,12,9,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,0,5,4,5,4,1,8,9,8,13,12,13,12,9,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3999,7 +3999,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,4,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,4,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,2,3,0,1,6,7,8,9,18,19,16,17,22,23,20,21,18,19,16,17,22,23,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4014,7 +4014,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,1,0,5,4,1,8,13,12,9,8,13,12,9,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,4,1,0,5,4,1,8,13,12,9,8,13,12,9,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4058,7 +4058,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX2-FAST-ALL: # %bb.0: -; 
AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4073,7 +4073,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,1,0,1,0,5,12,13,12,9,8,9,8,13,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,4,1,0,1,0,5,12,13,12,9,8,9,8,13,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4117,7 +4117,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4132,7 +4132,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,4,4,8,8,12,12,8,8,12,12,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,0,4,4,8,8,12,12,8,8,12,12,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4176,7 +4176,7 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX2-FAST-ALL: # %bb.0: -; 
AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4191,7 +4191,7 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,0,0,4,4,0,0,12,12,8,8,12,12,8,8,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,0,0,4,4,0,0,12,12,8,8,12,12,8,8,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4234,7 +4234,7 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,4,0,5,1,7,11,10,14,12,8,13,9,15,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,6,4,0,5,1,7,11,10,14,12,8,13,9,15,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4276,7 +4276,7 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,4,5,1,7,11,10,8,14,12,13,9,15,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,0,6,4,5,1,7,11,10,8,14,12,13,9,15,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4318,7 +4318,7 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,4,0,1,3,7,13,10,14,12,8,9,11,15,13] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = 
[2,6,4,0,1,3,7,13,10,14,12,8,9,11,15,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4353,28 +4353,28 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; ; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,2,3,6,7,10,11,0,1,4,5,14,15,16,17,16,17,18,19,22,23,26,27,16,17,20,21,30,31] ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,7,5,1,6,4,11,14,14,15,13,9,14,12,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [6,6,7,5,1,6,4,11,14,14,15,13,9,14,12,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4388,7 +4388,7 @@ define <16 x i16> 
@shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; XOPAVX2-NEXT: retq @@ -4417,7 +4417,7 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,4,5,4,5,4,5,8,9,16,17,16,17,20,21,20,21,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4432,7 +4432,7 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,8,12,12,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,8,12,12,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4476,7 +4476,7 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,0,1,4,5,4,5,0,1,0,1,0,1,8,9,16,17,16,17,20,21,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4491,7 +4491,7 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,8,8,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,8,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4535,7 +4535,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,4,5,4,5,8,9,16,17,20,21,20,21,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4550,7 +4550,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,8,12,12,8,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,8,12,12,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4594,7 +4594,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,0,1,0,1,8,9,16,17,20,21,20,21,16,17,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4609,7 +4609,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,0,0,8,8,12,12,8,8,8,8,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,0,0,0,8,8,12,12,8,8,8,8,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4654,7 +4654,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,7,4,6,7,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,3,7,4,6,7,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,6,7,8,9,14,15,16,17,20,21,20,21,16,17,20,21,22,23,24,25,26,27] ; AVX2-FAST-ALL-NEXT: retq @@ -4668,7 +4668,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,5,6,15,8,12,12,8,12,13,14,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,4,5,6,15,8,12,12,8,12,13,14,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4713,7 +4713,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[0,1,u,u,4,5,4,5,4,5,4,5,4,5,8,9,16,17,u,u,20,21,20,21,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4728,7 +4728,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,4,4,4,4,4,12,8,u,12,12,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,0,12,12,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4772,7 +4772,7 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,4,5,0,1,0,1,0,1,8,9,16,17,16,17,u,u,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4787,7 +4787,7 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,u,0,4,4,4,12,12,12,u,8,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,0,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4831,7 +4831,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[u,u,0,1,0,1,4,5,0,1,0,1,0,1,8,9,u,u,16,17,16,17,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4846,7 +4846,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,4,4,0,4,4,4,12,u,12,12,8,12,12,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,0,12,12,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4919,7 +4919,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,8,9,4,5,6,11,12,13,8,9,12,13,14,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,5,8,9,4,5,6,11,12,13,8,9,12,13,14,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5023,7 +5023,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5064,7 +5064,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,3,0,1,2,15,12,13,14,11,8,9,10,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,5,6,3,0,1,2,15,12,13,14,11,8,9,10,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5099,28 +5099,28 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 
; ; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,8,9,0,1,6,7,2,3,14,15,18,19,22,23,26,27,24,25,16,17,22,23,18,19,30,31] ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,1,0,2,7,3,13,11,15,9,8,10,15,11,13] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,7,1,0,2,7,3,13,11,15,9,8,10,15,11,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5134,7 +5134,7 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; 
XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; XOPAVX2-NEXT: retq @@ -5163,7 +5163,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5211,7 +5211,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5258,7 +5258,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3 ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5306,7 +5306,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] 
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5360,7 +5360,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,7,u,4,7,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,7,0,4,7,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] @@ -5378,7 +5378,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5431,7 +5431,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,4,u,6,4,u,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,0,4,0,6,4,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] @@ -5449,7 +5449,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5510,7 +5510,7 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2 ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5558,7 +5558,7 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1 ; ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5605,7 +5605,7 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1 ; ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5665,7 +5665,7 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5721,7 +5721,7 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,4,3,18,0,0,0,0,12,12,11,26,0,0,0,0] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5766,7 +5766,7 @@ define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,2,21,0,0,0,0,8,11,10,29,0,0,0,0] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5818,7 +5818,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,21,0,0,0,0,8,9,10,29,0,0,0,0] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5861,7 +5861,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,5,6,27,0,0,0,0,12,13,14,27] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5905,7 +5905,7 @@ define <16 x i16> 
@shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,5,6,19,0,0,0,0,12,13,14,27,0,0,0,0] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5953,7 +5953,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5998,7 +5998,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6063,7 +6063,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,1,0,5,7,25,0,0,0,9,0,13,15,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6115,7 +6115,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX512VL: # %bb.0: -; 
AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,20,0,0,2,4,0,0,0,28,0,8,10,12,0] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6161,7 +6161,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1 ; ; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6237,7 +6237,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1 ; ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6343,7 +6343,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1 ; ; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6420,7 +6420,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1 ; ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,4,5,6,7,0,1,10,11,12,13,14,15,8,9,10] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = 
[3,4,5,6,7,0,1,10,11,12,13,14,15,8,9,10] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6526,7 +6526,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2 ; ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6604,7 +6604,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2 ; ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6647,7 +6647,7 @@ define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6722,7 +6722,7 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u ; ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [7,0,19,0,4,4,21,0,15,0,27,0,12,12,29,0] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6995,7 +6995,7 
@@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_2 ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -7046,7 +7046,7 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -7452,16 +7452,16 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: PR24935: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,u,u,0,4,6,2] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,5,0,0,0,4,6,2] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[6,7],zero,zero,ymm0[18,19,22,23],zero,zero,zero,zero,ymm0[26,27,28,29,16,17],zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,3,0,0,6,4,u] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,6,3,0,0,6,4,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1],zero,zero,ymm1[6,7,0,1,10,11],zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[16,17,20,21],zero,zero,zero,zero,zero,zero,ymm1[24,25] ; AVX2-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -7477,13 +7477,13 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: PR24935: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -7513,7 +7513,7 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; XOPAVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -7564,7 +7564,7 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) { ; ; AVX512VL-LABEL: PR34369: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12] ; AVX512VL-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; AVX512VL-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 1cfa5e6dfdff5..dbcb49507ea19 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -69,7 +69,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -125,7 +125,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,512] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -181,7 +181,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,3] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -237,7 +237,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,0,67108864] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -293,7 +293,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; 
AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -349,7 +349,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,0,1536] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -405,7 +405,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,7] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -517,7 +517,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -573,7 +573,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; 
AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,2560,0,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -629,7 +629,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,11,0,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -685,7 +685,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,201326592,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -741,7 +741,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -797,7 +797,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,3584,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -853,7 +853,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,15,0] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -952,7 +952,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0] +; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,0,0,17] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -999,7 +999,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_ ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0] +; AVX512VLVBMI-NEXT: vpmovsxwd {{.*#+}} xmm1 = [0,0,0,4608] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1046,7 +1046,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,19] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1093,7 +1093,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxdq {{.*#+}} xmm1 = [0,335544320] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1140,7 +1140,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,21,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1187,7 +1187,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} xmm1 = [0,5632] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1234,7 +1234,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,23] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1422,7 +1422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,27,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1610,7 +1610,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0] +; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -2673,7 +2673,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: ; AVX1: # %bb.0: ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,0,0,0,0,0,0,1] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2687,7 +2687,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; XOPAVX1-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,0,0,0,0,0,0,1] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2705,7 +2705,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vpmovsxwd {{.*#+}} xmm2 = [0,0,0,512] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2719,7 +2719,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; XOPAVX1-NEXT: vpmovsxwd {{.*#+}} xmm2 = [0,0,0,512] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2737,7 
+2737,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2751,7 +2751,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2833,7 +2833,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2847,7 +2847,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] +; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; 
XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3409,7 +3409,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,6,1] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3422,7 +3422,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,6,1] ; AVX512VLBW-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3458,7 +3458,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; XOPAVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; XOPAVX2: # %bb.0: ; 
XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] +; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,6,1] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -4454,7 +4454,7 @@ define <32 x i8> @shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_ ; ; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index b8188450ec7fc..bc95fd42e6b84 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1015,7 +1015,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0124: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,2,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1044,7 +1044,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0142: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,2] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,4,2] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq 
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1068,7 +1068,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0412: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,2] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,1,2] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1096,7 +1096,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_4012: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1135,7 +1135,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0451: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,5,1] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,5,1] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1168,7 +1168,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_4015: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,5] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,5] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1197,7 +1197,7 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2u35: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,5,3,5] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1227,7 
+1227,7 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_1251: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,5,1] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,5,1] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1255,7 +1255,7 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1054: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1289,7 +1289,7 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3254: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,5,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1323,7 +1323,7 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3276: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,7,6] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,7,6] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1357,7 +1357,7 @@ define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1076: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,7,6] +; AVX512VL-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,7,6] ; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1381,7 +1381,7 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x 
i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0415: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,5] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,1,5] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1410,7 +1410,7 @@ define <4 x i64> @shuffle_v4i64_2741(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2741: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,7,4,1] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1439,7 +1439,7 @@ define <4 x i64> @shuffle_v4i64_0437(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0437: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,3,7] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,3,7] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -2058,9 +2058,9 @@ define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: add_v4i64_0246_1357: ; AVX512VL-FAST-ALL: # %bb.0: # %entry -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,4,6] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm3 ; AVX512VL-FAST-ALL-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -2114,9 +2114,9 @@ define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: add_v4i64_4602_5713: ; AVX512VL-FAST-ALL: # %bb.0: # %entry -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} 
ymm2 = [0,2,4,6] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm3 ; AVX512VL-FAST-ALL-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 02cad49d29fdb..fde3e0aecf5aa 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -2005,7 +2005,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] ; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2067,7 +2067,7 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_08192a3b: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] ; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2113,7 +2113,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2144,7 +2144,7 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_091b2d3f: ; 
AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2188,7 +2188,7 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_09ab1def: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2644,7 +2644,7 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2809,7 +2809,7 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2837,7 +2837,7 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2859,7 +2859,7 @@ define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) { ; ; 
AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_7654fedc: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2887,7 +2887,7 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_fedc7654: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -2916,7 +2916,7 @@ define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -2939,7 +2939,7 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -2971,7 +2971,7 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_089abcde: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] ; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, 
%ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3006,7 +3006,7 @@ define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_0189abcd: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3044,7 +3044,7 @@ define <8 x i32> @shuffle_v8i32_01289abc(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_01289abc: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] ; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3252,7 +3252,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_0dcd3f14: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,5,4,5,11,7,9,12] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,5,4,5,11,7,9,12] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3821,7 +3821,7 @@ define <8 x i32> @lowhalf_v8i32(<8 x i32> %x, <8 x i32> %y) { ; ; AVX512VL-LABEL: lowhalf_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,14,3,14] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,14] ; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index 9c0ffd4558fc8..dfa7f2dbdaeee 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ 
-218,7 +218,7 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_z define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) { ; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28] ; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -253,7 +253,7 @@ define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18: ; ALL: # %bb.0: -; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] +; ALL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] ; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> @@ -316,7 +316,7 @@ define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, ptr %b) { ; ALL-LABEL: shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18: ; ALL: # %bb.0: -; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] +; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] ; ALL-NEXT: vpermt2d (%rdi), %zmm1, %zmm0 ; ALL-NEXT: retq %c = load <16 x i32>, ptr %b @@ -516,7 +516,7 @@ define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_u define <16 x i32> @shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29(<16 x i32> %a, <16 x i32> %b) { ; AVX512F-LABEL: 
shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 24b1b42c2dc05..ca81317eb61a4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -42,20 +42,20 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_ ; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] ; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] -; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u] +; KNL-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,65535,0,0,65535,0,0,0,0,0,0,65535,0] ; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15] ; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] -; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255] +; KNL-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,0,0,65535,0,65535,65535,0,65535,65535,0,0,65535] ; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: 
; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31] +; SKX-NEXT: vpmovsxbw {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1,2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,31] ; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> @@ -82,7 +82,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1 ; ; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] +; SKX-NEXT: vpmovsxbw {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] ; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index f4cc8522adec5..85948f70ba5fa 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -212,7 +212,7 @@ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11] @@ -220,7 +220,7 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_ ; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,13,15,14,1,3,2,5,u,u,u,u,u,u,u,u,26,29,31,30,17,19,18,21,23,22,25,27,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] ; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ -228,13 +228,13 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_ ; AVX512BW-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u,33,35,34,37,39,38,41,43,42,45,47,46,u,u,u,u,49,51,50,53,55,54,57,59,58,61,63,62,u,u,u,u] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 
= ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11] @@ -242,7 +242,7 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_ ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,13,15,14,1,3,2,5,u,u,u,u,u,u,u,u,26,29,31,30,17,19,18,21,23,22,25,27,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq @@ -563,7 +563,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,16777215,0,0] ; AVX512F-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -590,7 +590,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm5 
= [18446744073709551615,16777215,0,0] ; AVX512BW-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -617,7 +617,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512DQ-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,16777215,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -659,14 +659,14 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -686,14 +686,14 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -713,14 +713,14 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -758,7 +758,7 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -785,7 +785,7 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -812,7 +812,7 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -842,14 +842,14 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -869,14 +869,14 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -896,14 +896,14 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -931,7 +931,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -948,7 +948,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -965,7 +965,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -989,10 +989,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; 
AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -1006,10 +1006,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -1023,10 +1023,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = 
[0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -1111,10 +1111,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -1128,10 +1128,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -1145,10 +1145,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -1370,7 +1370,7 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1465,7 +1465,7 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) { ; ; AVX512BW-LABEL: PR54562_ref: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,2,3,4,4,5] +; 
AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,1,2,3,4,4,5] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30,33,32,34,33,36,35,37,36,39,38,40,39,42,41,43,42,53,52,54,53,56,55,57,56,59,58,60,59,62,61,63,62] ; AVX512BW-NEXT: retq @@ -1510,7 +1510,7 @@ define void @PR54562_mem(ptr %src, ptr %dst) { ; ; AVX512BW-LABEL: PR54562_mem: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,2,3,4,4,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,1,2,3,4,4,5] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30,33,32,34,33,36,35,37,36,39,38,40,39,42,41,43,42,53,52,54,53,56,55,57,56,59,58,60,59,62,61,63,62] ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 008593a239f86..2387e05729661 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -1056,171 +1056,106 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_08084c4c: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_08084c4c: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,4,0,12,0,4,0,12,0] -; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_08084c4c: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] +; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x 
i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_8823cc67: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_8823cc67: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0] -; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_8823cc67: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] +; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_9832dc76: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_9832dc76: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0] -; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_9832dc76: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] +; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_9810dc54: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[1,0,9,8,5,4,13,12] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_9810dc54: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0] -; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_9810dc54: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] +; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_08194c5d: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_08194c5d: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,4,0,12,0,5,0,13,0] -; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_08194c5d: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] +; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_2a3b6e7f: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_2a3b6e7f: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,0,10,0,3,0,11,0,6,0,14,0,7,0,15,0] -; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: 
shuffle_v8i64_2a3b6e7f: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] +; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_08192a3b: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_08192a3b: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,2,0,10,0,3,0,11,0] -; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_08192a3b: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_08991abb: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_08991abb: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_08991abb: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] +; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: 
shuffle_v8i64_091b2d3f: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_091b2d3f: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,1,0,11,0,2,0,13,0,3,0,15,0] -; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_091b2d3f: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] +; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_09ab1def: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8i64_09ab1def: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_09ab1def: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] +; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -1647,19 +1582,12 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_6caa87e5: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: 
shuffle_v8i64_6caa87e5: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0] -; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_6caa87e5: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13] +; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 917f26dd3cadf..0c76c14afb0ae 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -627,7 +627,7 @@ define <8 x i32> @concat_self_v8i32(<4 x i32> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} @@ -636,7 +636,7 @@ define <8 x i32> @concat_self_v8i32(<4 x i32> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] ; AVX512-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: ret{{[l|q]}} @@ -696,15 +696,15 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: popl %ebp ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: bit_reversal_permutation: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,4,0,12,0,2,0,10,0,6,0,14,0] -; X86-AVX512-NEXT: 
vpermi2q %zmm1, %zmm0, %zmm2 -; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,0,9,0,5,0,13,0,3,0,11,0,7,0,15,0] -; X86-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; X86-AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; X86-AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; X86-AVX512-NEXT: retl +; AVX512-LABEL: bit_reversal_permutation: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,2,10,6,14] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: ret{{[l|q]}} ; ; X64-AVX1-LABEL: bit_reversal_permutation: ; X64-AVX1: # %bb.0: @@ -742,16 +742,6 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vmovaps %ymm4, %ymm1 ; X64-AVX2-NEXT: vmovaps %ymm5, %ymm3 ; X64-AVX2-NEXT: retq -; -; X64-AVX512-LABEL: bit_reversal_permutation: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,4,12,2,10,6,14] -; X64-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,9,5,13,3,11,7,15] -; X64-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; X64-AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; X64-AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; X64-AVX512-NEXT: retq %v0 = shufflevector <16 x i64> %a0, <16 x i64> undef, <16 x i32> %v1 = shufflevector <16 x i64> %v0, <16 x i64> undef, <16 x i32> ret <16 x i64> %v1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index f2e42d51a7eb6..1a46423f4d113 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -889,7 +889,7 @@ define void @PR63030(ptr %p0) { ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vmovdqa (%eax), %xmm0 -; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[1,0,8,0,0,0,0,0,0,0,9,0,1,0,1,0] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1] ; X86-AVX512-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1 ; X86-AVX512-NEXT: vmovdqa64 %zmm1, (%eax) ; X86-AVX512-NEXT: vzeroupper @@ -913,7 +913,7 @@ define void @PR63030(ptr %p0) { ; X64-AVX512-LABEL: PR63030: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1] ; X64-AVX512-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; X64-AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; X64-AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index a5ba81d516f72..bbc87eda82a5d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -21,19 +21,19 @@ define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) { ; X86-LABEL: combine_vpermt2var_32i16_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] +; X86-NEXT: vpmovsxbw {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] ; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_32i16_identity_mask: ; X64: # %bb.0: -; X64-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] +; X64-NEXT: vpmovsxbw {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] ; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 %m) @@ -146,7 +146,7 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) { define <64 x i8> @combine_permi2q_pshufb_as_permi2d(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: combine_permi2q_pshufb_as_permi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] ; CHECK-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} @@ -159,16 +159,14 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64 ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,12,0,5,0,14,0,7,0,12,0,5,0,14,0] -; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; X86-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,0,5,0,0,12,0,14] ; X86-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; X86-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] ; X86-NEXT: retl ; ; 
X64-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X64: # %bb.0: -; X64-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,12,5,14,7,12,5,14] -; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; X64-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,0,5,0,0,12,0,14] ; X64-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; X64-NEXT: kmovq %rdi, %k1 ; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] @@ -293,7 +291,7 @@ define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) { ; CHECK-LABEL: combine_vpermi2var_32i16_as_permw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x1, i32 -1) @@ -304,7 +302,7 @@ define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] +; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] ; CHECK-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll 
b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index 23c37af1db2f7..388511ce0741f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -16,19 +16,19 @@ define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16> define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x i16> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16i16_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} -; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_16i16_identity_mask: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} -; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} ; X64-NEXT: retq %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %x0, <16 x i16> %x1, i16 %m) @@ -39,7 +39,7 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> %x1) { ; CHECK-LABEL: combine_vpermi2var_16i16_as_permw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7] +; CHECK-NEXT: 
vpmovsxbw {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1) @@ -50,7 +50,7 @@ define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] ; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index 27375e96b3160..f53b1eeaf8f54 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -92,42 +92,42 @@ define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) { define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { ; X86-AVX512F-LABEL: combine_permvar_8i64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X86-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; 
X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_permvar_8i64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X86-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_permvar_8i64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_permvar_8i64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-AVX512BW-NEXT: retq @@ -255,39 +255,39 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 
%m) { ; X86-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] ; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] ; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] ; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; 
X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] ; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 %m) @@ -595,28 +595,28 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16i32_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; 
X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 %m) @@ -920,7 +920,7 @@ define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x do define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] ; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll index 78ae0d23da978..9b32005927ace 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -116,7 +116,7 @@ define <64 x i8> @combine_vpermi2var_64i8_as_vperm2(<64 x i8> %x0, <64 x i8> %x1 define <64 x i8> @combine_permi2q_pshufb_as_permi2d(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: combine_permi2q_pshufb_as_permi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] ; CHECK-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; 
CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index d285d07e66049..d02a9a64b0302 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2698,13 +2698,13 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { ; ; SSE41-LABEL: combine_constant_insertion_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [u,4,5,30] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4,5,30] ; SSE41-NEXT: pinsrd $0, %edi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_constant_insertion_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,4,5,30] +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,5,30] ; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 @@ -3282,7 +3282,7 @@ define void @PR45604(ptr %dst, ptr %src) { ; SSE41-NEXT: movdqa (%rsi), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,0,11,0,u,0,11,0] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [0,11,0,11] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero @@ -3325,7 +3325,7 @@ define void @PR45604(ptr %dst, ptr %src) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] +; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [151519488,185205506,218891524,252577542] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0] ; 
AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index aca50c461a7a1..c977929b21f45 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} -; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0] ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} @@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 -; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] +; VL_BW_DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 @@ -111,7 +111,7 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) { define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> 
%a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) { ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 @@ -123,7 +123,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; AVX512VL-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 @@ -135,7 +135,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; VL_BW_DQ: # %bb.0: -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 @@ -155,7 +155,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -169,7 +169,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} 
{z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -183,7 +183,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 @@ -207,7 +207,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -226,7 +226,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -262,7 +262,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -283,7 +283,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -320,7 +320,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -340,7 +340,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd $255, 
%zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -372,7 +372,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -387,7 +387,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -423,7 +423,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -438,7 +438,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -542,7 +542,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,2,10,u,3,u,2,u] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,2,10,0,3,0,2,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -557,7 +557,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 @@ -571,7 +571,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] +; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] ; 
VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax @@ -632,7 +632,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -646,7 +646,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax @@ -659,7 +659,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax @@ -677,7 +677,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 @@ -719,8 +719,8 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq 
$255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -765,7 +765,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 @@ -781,7 +781,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax @@ -794,7 +794,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index 
6941585c6a5f0..e83c1e8482773 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -19,7 +19,7 @@ define <64 x i8> @f1(ptr %p0) { ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] @@ -163,7 +163,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] @@ -173,7 +173,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 @@ -310,7 +310,7 @@ define <64 x i8> @f3(ptr %p0) { ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7] ; AVX2-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm4 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -343,7 +343,7 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] ; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] @@ -446,7 +446,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] @@ -456,7 +456,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 7fa64520f9314..391c55b914c87 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -43,8 +43,7 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -225,8 +224,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] -; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -481,8 +479,7 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: 
vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -816,8 +813,7 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -998,8 +994,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] -; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -1224,8 +1219,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -1595,7 +1589,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; 
AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1837,8 +1831,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] -; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm7 = [255,255] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -2140,8 +2133,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2300,8 +2292,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255] -; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm8 = [255,255] ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index f93f5682df826..3ec2ba3de9a2f 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -56,7 +56,7 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = 
[4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -77,28 +77,16 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_packus_v2i64_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v2i64_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: retq +; AVX-LABEL: trunc_packus_v2i64_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -186,7 +174,7 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE41-NEXT: movdqa 
{{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -208,30 +196,17 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_packus_v2i64_v2i32_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v2i64_v2i32_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -343,7 +318,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: 
movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm6, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 @@ -388,8 +363,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -425,8 +399,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -613,7 +586,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm6, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 @@ -705,8 +678,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; 
AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4294967295,4294967295] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -762,7 +734,7 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; AVX2-FAST-ALL-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 ; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -2293,28 +2265,16 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_packus_v2i64_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v2i64_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: retq +; AVX-LABEL: trunc_packus_v2i64_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [255,255] +; AVX-NEXT: 
vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2459,30 +2419,17 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_packus_v2i64_v2i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v2i64_v2i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [255,255] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512F: # 
%bb.0: @@ -2645,8 +2592,7 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [255,255] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -2840,8 +2786,7 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [255,255] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -3143,8 +3088,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [255,255] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -3434,8 +3378,7 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [255,255] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -3919,8 +3862,7 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; AVX1-LABEL: trunc_packus_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; 
AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [255,255] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 14f724fc3b8c7..9ba0cce979a26 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -58,7 +58,7 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [2147483647,2147483647] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -68,7 +68,7 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4294967295,0,4294967295] ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 @@ -181,7 +181,7 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [2147483647,2147483647] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -191,7 +191,7 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> 
%a0, ptr %p1) { ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4294967295,0,4294967295] ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 @@ -335,7 +335,7 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] ; SSE41-NEXT: movdqa %xmm6, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm7 @@ -357,7 +357,7 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [0,4294967295,0,4294967295] ; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 @@ -606,7 +606,7 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] ; SSE41-NEXT: movdqa %xmm6, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm9 @@ -650,7 +650,7 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movapd {{.*#+}} xmm2 = 
[18446744071562067968,18446744071562067968] ; SSE41-NEXT: movapd %xmm1, %xmm7 ; SSE41-NEXT: xorpd %xmm3, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [0,4294967295,0,4294967295] ; SSE41-NEXT: movapd %xmm7, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 @@ -872,12 +872,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -886,10 +884,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -898,10 +896,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] +; 
AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -1009,12 +1007,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1024,10 +1020,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1037,10 
+1033,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1200,15 +1196,13 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -1386,15 +1380,13 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -1682,8 +1674,7 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [32767,32767] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [32767,32767] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -1692,8 +1683,7 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -2033,29 +2023,16 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_ssat_v2i64_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq 
%xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v2i64_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: retq +; AVX-LABEL: trunc_ssat_v2i64_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2192,31 +2169,17 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq 
{{.*#+}} xmm1 = [127,127] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -2373,15 +2336,13 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -2563,15 +2524,13 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX1: # %bb.0: -; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -2863,8 +2822,7 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [127,127] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -2873,8 +2831,7 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -3161,8 +3118,7 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = 
[127,127] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -3171,8 +3127,7 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 @@ -3655,8 +3610,7 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; AVX1-LABEL: trunc_ssat_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 @@ -3680,8 +3634,7 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 ; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: # xmm6 = mem[0,0] +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 ; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 @@ -4518,12 +4471,12 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; ; SSE41-LABEL: trunc_ssat_v16i32_v16i24: ; SSE41: # %bb.0: -; 
SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8388607,8388607,8388607,8388607] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [65535,127,65535,127,65535,127,65535,127] ; SSE41-NEXT: pminsd %xmm4, %xmm3 ; SSE41-NEXT: pminsd %xmm4, %xmm2 ; SSE41-NEXT: pminsd %xmm4, %xmm1 ; SSE41-NEXT: pminsd %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4286578688,4286578688,4286578688,4286578688] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [0,65408,0,65408,0,65408,0,65408] ; SSE41-NEXT: pmaxsd %xmm4, %xmm0 ; SSE41-NEXT: pmaxsd %xmm4, %xmm1 ; SSE41-NEXT: pmaxsd %xmm4, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index f40a7e39b9869..8ddeaf1bf7aa1 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -1429,7 +1429,7 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { ; ; SSE41-LABEL: trunc_usat_v8i32_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -1587,7 +1587,7 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v16i32_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: pminud %xmm0, %xmm2 ; SSE41-NEXT: movdqa 32(%rdi), %xmm1 @@ -2986,7 +2986,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; ; SSE41-LABEL: trunc_usat_v8i32_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [255,255,255,255] ; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -3076,7 +3076,7 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; 
SSE41-LABEL: trunc_usat_v8i32_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [255,255,255,255] ; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -3190,7 +3190,7 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm1 = [255,255,255,255] ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: movdqa (%rdi), %xmm0 @@ -3291,7 +3291,7 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) { ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm0 = [255,255,255,255] ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: movdqa (%rdi), %xmm2 @@ -3729,7 +3729,7 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v32i32_v32i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [255,255,255,255] ; SSE41-NEXT: movdqa 80(%rdi), %xmm0 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: movdqa 64(%rdi), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 3294c7ffee40d..57dd39e3fd4bb 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -232,7 +232,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) { ; ; SSE41-LABEL: trunc8i64_8i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 @@ -505,7 +505,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; ; 
SSE41-LABEL: trunc8i32_8i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm2 = [255,255,255,255] ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -789,7 +789,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) { ; ; SSE41-LABEL: trunc16i32_16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pmovsxwd {{.*#+}} xmm4 = [255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 3dc43031cea9e..882b816370478 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -150,7 +150,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -351,7 +351,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll index 18bd9e72fe3ea..a147c914122b1 100644 --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -512,7 +512,7 @@ define <8 x i16> 
@PR47448_uge(i16 signext %0) { ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; SSE41-NEXT: pmaxuw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index 7abd28841b9ca..74926f46ffa43 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2493,7 +2493,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63] +; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [63,63] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll index 050b3329a4abb..901f7e4a00eb5 100644 --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -83,7 +83,7 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) { ; AVX-LABEL: cmp_sel_Cplus1_or_C_vec: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,0,4294967294,4294967295] +; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,18446744073709551614] ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %cond = icmp eq <4 x i32> %x, %y @@ -292,7 +292,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] 
+; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,0] ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index ffc929c1237cd..032dd0626b9a0 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -531,8 +531,7 @@ define <4 x i64> @blend_splat1_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpsllq $63, %xmm3, %xmm3 -; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] -; XOP-NEXT: # xmm4 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] ; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpsllq $63, %xmm0, %xmm0 ; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0 @@ -861,8 +860,7 @@ define <4 x i64> @blend_splat_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpsllq $62, %xmm3, %xmm3 -; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] -; XOP-NEXT: # xmm4 = mem[0,0] +; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] ; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpsllq $62, %xmm0, %xmm0 ; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0 @@ -1021,7 +1019,7 @@ define <2 x i64> @blend_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,4] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1184,7 +1182,7 @@ define <4 x i64> @blend_mask_cond_v4i64(<4 x i64> 
%x, <4 x i64> %y, <4 x i64> %z ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [2,4,32768,1] +; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [2,4,32768,1] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1231,7 +1229,7 @@ define <8 x i32> @blend_mask_cond_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,8,4,8,1024,2,4096] +; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,8,4,8,1024,2,4096] ; AVX512F-NEXT: vptestnmd %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmd %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index f45ee75e74f31..ce3dc8cc873cc 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -686,21 +686,33 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) { ; Test to make sure we don't try to insert a new setcc to swap the operands ; of select with all zeros LHS if the setcc has additional users. 
define void @vselect_allzeros_LHS_multiple_use_setcc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, ptr %p1, ptr %p2) { -; SSE-LABEL: vselect_allzeros_LHS_multiple_use_setcc: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, (%rdi) -; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: retq +; SSE2-LABEL: vselect_allzeros_LHS_multiple_use_setcc: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm3, (%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSE41-LABEL: vselect_allzeros_LHS_multiple_use_setcc: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pandn %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, (%rdi) +; SSE41-NEXT: movdqa %xmm0, (%rsi) +; SSE41-NEXT: retq ; ; AVX-LABEL: vselect_allzeros_LHS_multiple_use_setcc: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8] +; AVX-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,2,4,8] ; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/widen_arith-5.ll b/llvm/test/CodeGen/X86/widen_arith-5.ll index 8f1cbd1767f88..dd75a4ea2cf22 100644 --- a/llvm/test/CodeGen/X86/widen_arith-5.ll +++ b/llvm/test/CodeGen/X86/widen_arith-5.ll @@ -13,7 +13,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = 
[3,3,3,u] +; CHECK-NEXT: pmovsxbd {{.*#+}} xmm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 06140b2395fca..cb038b3211abd 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -379,7 +379,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1OR2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u] +; AVX1OR2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,5,9,13,13,5,12,13] ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -388,7 +388,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] ; AVX1OR2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1OR2-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u] +; AVX1OR2-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,7,11,15,7,15,6,7] ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm4 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] @@ -404,7 +404,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u] +; AVX512-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,7,11,15,7,15,6,7] ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -722,7 +722,7 @@ define <32 x i1> 
@interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; ; AVX512-LABEL: interleaved_load_vf32_i8_stride4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -871,7 +871,7 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(ptr %ptr){ ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] @@ -1745,9 +1745,9 @@ define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat4_v8f32_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) @@ -1805,9 +1805,9 @@ 
define void @splat4_v8i32_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat4_v8i32_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index 8c9dc90d2a71d..40c537ddec9c9 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -2065,7 +2065,7 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -2329,7 +2329,7 @@ define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,u,u,1,u,u,u] +; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -2855,7 +2855,7 @@ define 
void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2,u,3,3,u,4,4,u,5] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2,0,3,3,0,4,4,0,5] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero @@ -3824,7 +3824,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3841,7 +3841,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3856,7 +3856,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,u,1,1,u,2,2] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3871,7 +3871,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] +; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3888,7 +3888,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3903,7 +3903,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 
= xmm0[3,3,3,3] @@ -3919,7 +3919,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -4217,7 +4217,7 @@ define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -4233,7 +4233,7 @@ define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4398,7 +4398,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), 
%ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -4414,7 +4414,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4521,7 +4521,7 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -4855,7 +4855,7 @@ define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -5009,7 +5009,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] +; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5024,7 +5024,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, 
%zmm2, %ymm0 @@ -5369,7 +5369,7 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,1,11,2,13,u,u] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6731,7 +6731,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -7178,10 +7178,10 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] +; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,u,u,3,u,u,u] +; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7214,7 +7214,7 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; 
AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -7582,7 +7582,7 @@ define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,5,3,7] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,5,3,7] ; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 8ab53140eb911..a5782c0b4d1cc 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -750,7 +750,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -761,7 +761,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax @@ -868,7 +868,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -881,7 +881,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -968,7 +968,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -978,7 +978,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 @@ -989,7 +989,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 @@ -1001,7 +1001,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1450,7 +1450,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -1466,8 +1466,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1640,7 +1639,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. 
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1759,7 +1758,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1875,7 +1874,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1984,7 +1983,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2010,7 +2009,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2032,7 +2031,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2136,7 +2135,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2162,7 +2161,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2184,7 +2183,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; 
AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2288,7 +2287,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] ; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2314,7 +2313,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] ; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2336,7 +2335,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -3001,8 +3000,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = 
[18446744073709551360,18446744073709551360] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 @@ -3150,7 +3148,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX-NEXT: vpmovsxwd {{.*#+}} xmm3 = [4294967040,4294967295,4294967295,4294967040] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -3295,7 +3293,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 @@ -3313,12 +3311,10 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [255,0,255,0] ; AVX2-NEXT: vpand %ymm3, %ymm2, 
%ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3444,7 +3440,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 @@ -3463,12 +3459,10 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [255,0,18446744073709551615,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq 
{{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3656,7 +3650,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -3671,7 +3665,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -3859,7 +3853,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -3875,7 +3869,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4086,7 +4080,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4102,7 +4096,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4292,7 +4286,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4308,7 +4302,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4445,7 +4439,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4584,7 +4578,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 @@ -4735,7 +4729,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4753,7 +4747,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4767,7 +4761,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] +; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4784,7 +4778,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] @@ -4880,7 +4874,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4901,7 +4895,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4920,7 +4914,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4940,7 +4934,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4958,7 +4952,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4972,7 +4966,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4989,7 +4983,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -5098,7 +5092,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5116,7 +5110,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: 
vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5129,7 +5123,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -5219,7 +5213,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] @@ -5237,8 +5231,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 @@ -5255,8 +5248,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] -; 
AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 @@ -5270,7 +5262,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] @@ -5373,7 +5365,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5391,7 +5383,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5404,7 +5396,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-SLOW: # %bb.0: ; 
AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,11] +; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 @@ -5418,8 +5410,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7] -; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -5521,8 +5512,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -5539,8 +5529,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 @@ 
-5552,7 +5541,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,10,0] +; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 @@ -5565,7 +5554,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,0] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,2,0] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -6209,7 +6198,7 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6328,7 +6317,7 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in. 
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6447,7 +6436,7 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6656,7 +6645,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6671,7 +6660,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. 
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6686,7 +6675,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6780,7 +6769,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -6795,7 +6784,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, 
%ymm0 @@ -6811,7 +6800,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6897,7 +6886,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -6912,7 +6901,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -7002,7 +6991,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: 
vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7017,7 +7006,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7032,7 +7021,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -7098,7 +7087,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -7113,7 +7102,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, 
%ymm0 @@ -7192,7 +7181,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7207,7 +7196,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index c362bdaa3217d..b6a9947b696fb 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -643,7 +643,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -652,7 +652,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] @@ -737,7 +737,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -747,7 +747,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -1190,7 +1190,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 @@ -1202,8 +1202,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1339,7 +1338,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. 
; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1428,7 +1427,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1515,7 +1514,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1678,7 +1677,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 ; AVX512F-FAST-NEXT: 
vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1697,7 +1696,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1716,7 +1715,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] ; AVX512BW-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1791,7 +1790,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1810,7 +1809,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; 
AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1829,7 +1828,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2398,8 +2397,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -2528,7 +2526,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967040,4294967295,4294967295,4294967040] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -2651,7 +2649,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -2665,12 +2663,10 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm1 = [255,0,255,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = 
[18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand (%rdi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -2779,7 +2775,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vmovaps 32(%rsi), %ymm2 @@ -2795,12 +2791,10 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [255,0,18446744073709551615,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -2812,7 +2806,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr 
%i ; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX512F-NEXT: vpternlogq $202, (%rdi), %xmm0, %xmm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2827,7 +2821,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq $202, (%rdi), %xmm0, %xmm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2967,7 +2961,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3122,7 +3116,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3306,7 +3300,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3464,7 +3458,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,32,63,8,9,10,11,32,13,14,15,16,17,32,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,60,61,32,63,8,9,10,11,32,13,14,15,16,17,32,19,20,21,22,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3579,7 +3573,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3692,7 +3686,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,8,9,10,11,32,13,14,15,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,8,9,10,11,32,13,14,15,16,17,18,19,20,21,22,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3821,7 +3815,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3835,7 +3829,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3849,7 +3843,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3928,7 +3922,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] @@ -3945,7 +3939,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] @@ -3962,7 +3956,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] @@ -3977,7 +3971,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3991,7 +3985,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4005,7 +3999,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4091,7 +4085,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4105,7 +4099,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4119,7 +4113,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4190,7 +4184,7 @@ define void 
@vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] @@ -4203,8 +4197,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4216,8 +4209,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4229,7 +4221,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,8,9,10,11,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,8,9,10,11,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4314,7 +4306,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4328,7 +4320,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4342,7 +4334,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4424,8 +4416,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 
32(%rsi), %ymm1 @@ -4438,8 +4429,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1 @@ -4452,7 +4442,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,10,0,12,13,u,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,12,13,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4918,7 +4908,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,u,10,u,0,u,14,u] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,10,0,0,0,14,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -5020,7 +5010,7 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] ; AVX512BW-NEXT: vpermi2w %zmm1, 
%zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5122,7 +5112,7 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in. ; ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,1,2,3,32,5,6,7,32,9,10,11,32,13,14,15,32,17,18,19,32,21,22,23,32,25,26,27,32,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,1,2,3,32,5,6,7,32,9,10,11,32,13,14,15,32,17,18,19,32,21,22,23,32,25,26,27,32,29,30,31] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5226,7 +5216,7 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,32,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,32,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5323,7 +5313,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5434,7 +5424,7 @@ define void 
@vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -5448,7 +5438,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -5462,7 +5452,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5526,7 +5516,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5539,7 +5529,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5552,7 +5542,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5609,7 +5599,7 @@ define void 
@vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5622,7 +5612,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5635,7 +5625,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512BW-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5702,7 +5692,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -5716,7 +5706,7 @@ 
define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -5730,7 +5720,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5786,7 +5776,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5799,7 +5789,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5812,7 +5802,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512BW-LABEL: 
vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5961,7 +5951,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -5975,7 +5965,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -5989,7 +5979,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)