diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 09b951befac9f..205fd24e6d402 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6104,7 +6104,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
       const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
-      if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) {
+      if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
+          (MI.getOpcode() != X86::INSERTPSrr || Alignment >= Align(4))) {
         int PtrOffset = SrcIdx * 4;
         unsigned NewImm = (DstIdx << 4) | ZMask;
         unsigned NewOpCode =
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index 6a10d855ea95c..4ce092c099b08 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -184,14 +184,12 @@ define <4 x float> @nofold_insertps(ptr %a, <4 x float> %b) {
 ; X86-LABEL: nofold_insertps:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vmovups (%eax), %xmm1
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; X86-NEXT:    vinsertps $48, 8(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: nofold_insertps:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovups (%rdi), %xmm1
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; X64-NEXT:    vinsertps $48, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, ptr %a, align 1
   %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32>